From: Michael R. Crusoe Date: Tue, 19 Nov 2024 11:44:45 +0000 (+0100) Subject: Import python-pcre2_0.4.0+ds.orig.tar.xz X-Git-Tag: archive/raspbian/0.6.0+ds-1+rpi1~4 X-Git-Url: https://dgit.raspbian.org/%22http:/www.example.com//%22ander.pijoan%40deusto.es/%22/%22http:/www.example.com/%22ander.pijoan%40deusto.es/%22?a=commitdiff_plain;h=f2b7df828ced1de1f4690094097b3c60c8055011;p=python-pcre2.git Import python-pcre2_0.4.0+ds.orig.tar.xz [dgit import orig python-pcre2_0.4.0+ds.orig.tar.xz] --- f2b7df828ced1de1f4690094097b3c60c8055011 diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..1b59b31 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.7.2) + +project(pcre2) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +set(CMAKE_C_STANDARD 99) + +set(PCRE2_INCLUDE_DIR ${CMAKE_BINARY_DIR}/src/libpcre2) +set(CYTHON_EXTRA_COMPILE_ARGS -DPCRE2_CODE_UNIT_WIDTH=8 -fPIC) + +# Set PCRE2 options. +set(PCRE2_SUPPORT_JIT ON CACHE BOOL "" FORCE) +set(PCRE2_NEVER_BACKSLASH_C ON CACHE BOOL "" FORCE) + +# Always make a release build. +set(CMAKE_BUILD_TYPE Release) + +# Build PCRE2 library as both shared and static. +set(BUILD_STATIC_LIBS ON) +set(BUILD_SHARED_LIBS ON) +add_subdirectory(src/libpcre2) + +# Build Cython code as shared. +set(BUILD_STATIC_LIBS OFF) +set(BUILD_SHARED_LIBS ON) +add_subdirectory(src/pcre2) + +# Include PCRE2 header for Cython API. +install(FILES ${PCRE2_INCLUDE_DIR}/pcre2.h DESTINATION src/pcre2) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4a57011 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, grtetrault +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..eb2eb2a --- /dev/null +++ b/Makefile @@ -0,0 +1,27 @@ +SHELL = /bin/bash + +init: + git submodule update --init + python3 -m venv ./.venv + ./.venv/bin/pip install -r ./requirements/build-requirements.txt + ./.venv/bin/pip install -r ./requirements/test-requirements.txt + ./.venv/bin/pip install . + +build: + ./.venv/bin/pip install . --force-reinstall + +clean: + rm -rf ./dist + rm -rf ./build + rm -rf ./_skbuild + find ./src/pcre2 -type f -name '*.c' -print0 | xargs -0 rm -vf + find ./src/pcre2 -type f -name '*.html' -print0 | xargs -0 rm -vf + find . -type f -name '*.pyc' | xargs rm -r + find . -type d -name '*.egg-info' | xargs rm -r + find . 
-type d -name '*.ipynb_checkpoints' | xargs rm -r + +purge: + rm -rf ./.venv + +benchmark: + ./.venv/bin/python ./benchmarks/run_regex_redux.py diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..0a6cd5b --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,134 @@ +Metadata-Version: 2.1 +Name: pcre2 +Version: 0.4.0 +Summary: Python bindings for the PCRE2 regular expression library +Home-page: https://github.com/grtetrault/pcre2.py +Author: Garrett Tetrault +License: BSD 3-Clause License +Classifier: Development Status :: 3 - Alpha +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: C +Classifier: Programming Language :: Cython +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Operating System :: MacOS :: MacOS X +Classifier: Operating System :: POSIX :: Linux +Classifier: Operating System :: Microsoft :: Windows +Description-Content-Type: text/markdown +License-File: LICENSE + +# PCRE2.py: Python bindings for the PCRE2 regular expression library + +This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2). +PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel. +For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2). + +## Installation + +From PyPI: +``` +pip install pcre2 +``` + +If a wheel is not available for your platform, the module will be built from source. 
+Building requires: + +* `cmake` +* C compiler toolchain, such as `gcc` and `make` +* `libtool` +* Python headers + +## Usage + +Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects. +This returns a `Pattern` object. +Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled, + +```python +>>> import pcre2 +>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)' +>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True) +>>> # Patterns can also be JIT compiled after initialization. +>>> patn.jit_compile() +``` + +Inspection of `Pattern` objects is done as follows, + +```python +>>> patn.jit_size +980 +>>> patn.name_dict() +{1: 'head', 2: 'tail'} +>>> patn.options +524296 +>>> # Deeper inspection into options is available. +>>> pcre2.CompileOption.decompose(patn.options) +[, ] +``` + +Once compiled, `Pattern` objects can be used to match against strings. +Matching returns a `Match` object, which has several functions to view results, + +```python +>>> subj = 'foo bar buzz bazz' +>>> match = patn.match(subj) +>>> match.substring() +'foo bar' +>>> match.start(), match.end() +(8, 17) +``` + +Substitution is also supported, both from `Pattern` and `Match` objects, + +```python +>>> repl = '$2 $1' +>>> patn.substitute(repl, subj) # Global substitutions by default. +'bar foo bazz buzz' +>>> patn.substitute(repl, subj, suball=False) +'bar foo buzz bazz' +>>> match.expand(repl) +'bar foo buzz bazz' +``` + +Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches, + +```python +>>> for match in patn.scan(subj): +... print(match.substring('head')) +... +foo +buzz +``` + +## Performance + +PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled. 
+Below are the `regex-redux` benchmark results included in this repository, + +| Script | Number of runs | Total time | Real time | User time | System time | +| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- | +| `baseline.py` | 10 | 3.020 | 0.302 | 0.020 | 0.086 | +| `vanilla.py` | 10 | 51.380 | 5.138 | 11.408 | 0.529 | +| `hand_optimized.py` | 10 | 13.190 | 1.319 | 2.846 | 0.344 | +| `pcre2_module.py` | 10 | 13.670 | 1.367 | 2.269 | 0.532 | + +Script descriptions are as follows, + +| Script | Description | +| ------------------- | -------------------------------------------------------------------- | +| `baseline.py` | Reads input file and outputs stored expected output | +| `vanilla.py` | Pure Python version | +| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library | +| `pcre2_module.py` | Implementation using Python bindings written here | + +Tests were performed on an M2 Macbook Air. +Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset. +Additionally, a Python virtual environment must be created, and the package built +with `make init` and `make build` respectively. +For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html). +See source code of benchmark scripts for details and original sources. diff --git a/README.md b/README.md new file mode 100755 index 0000000..e207b84 --- /dev/null +++ b/README.md @@ -0,0 +1,110 @@ +# PCRE2.py: Python bindings for the PCRE2 regular expression library + +This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2). +PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel. +For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2). 
+ +## Installation + +From PyPI: +``` +pip install pcre2 +``` + +If a wheel is not available for your platform, the module will be built from source. +Building requires: + +* `cmake` +* C compiler toolchain, such as `gcc` and `make` +* `libtool` +* Python headers + +## Usage + +Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects. +This returns a `Pattern` object. +Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled, + +```python +>>> import pcre2 +>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)' +>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True) +>>> # Patterns can also be JIT compiled after initialization. +>>> patn.jit_compile() +``` + +Inspection of `Pattern` objects is done as follows, + +```python +>>> patn.jit_size +980 +>>> patn.name_dict() +{1: 'head', 2: 'tail'} +>>> patn.options +524296 +>>> # Deeper inspection into options is available. +>>> pcre2.CompileOption.decompose(patn.options) +[, ] +``` + +Once compiled, `Pattern` objects can be used to match against strings. +Matching returns a `Match` object, which has several functions to view results, + +```python +>>> subj = 'foo bar buzz bazz' +>>> match = patn.match(subj) +>>> match.substring() +'foo bar' +>>> match.start(), match.end() +(8, 17) +``` + +Substitution is also supported, both from `Pattern` and `Match` objects, + +```python +>>> repl = '$2 $1' +>>> patn.substitute(repl, subj) # Global substitutions by default. +'bar foo bazz buzz' +>>> patn.substitute(repl, subj, suball=False) +'bar foo buzz bazz' +>>> match.expand(repl) +'bar foo buzz bazz' +``` + +Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches, + +```python +>>> for match in patn.scan(subj): +... print(match.substring('head')) +... +foo +buzz +``` + +## Performance + +PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled. 
+Below are the `regex-redux` benchmark results included in this repository, + +| Script | Number of runs | Total time | Real time | User time | System time | +| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- | +| `baseline.py` | 10 | 3.020 | 0.302 | 0.020 | 0.086 | +| `vanilla.py` | 10 | 51.380 | 5.138 | 11.408 | 0.529 | +| `hand_optimized.py` | 10 | 13.190 | 1.319 | 2.846 | 0.344 | +| `pcre2_module.py` | 10 | 13.670 | 1.367 | 2.269 | 0.532 | + +Script descriptions are as follows, + +| Script | Description | +| ------------------- | -------------------------------------------------------------------- | +| `baseline.py` | Reads input file and outputs stored expected output | +| `vanilla.py` | Pure Python version | +| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library | +| `pcre2_module.py` | Implementation using Python bindings written here | + +Tests were performed on an M2 Macbook Air. +Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset. +Additionally, a Python virtual environment must be created, and the package built +with `make init` and `make build` respectively. +For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html). +See source code of benchmark scripts for details and original sources. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100755 index 0000000..c0f420a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,8 @@ +[build-system] +requires = [ + "setuptools>=42", + "scikit-build", + "Cython", + "cmake" +] +build-backend = "setuptools.build_meta" diff --git a/requirements/build-requirements.txt b/requirements/build-requirements.txt new file mode 100644 index 0000000..067a22d --- /dev/null +++ b/requirements/build-requirements.txt @@ -0,0 +1,6 @@ +requests +build +wheel +scikit-build +cmake +Cython \ No newline at end of file diff --git a/requirements/test-requirements.txt b/requirements/test-requirements.txt new file mode 100644 index 0000000..209b771 --- /dev/null +++ b/requirements/test-requirements.txt @@ -0,0 +1,3 @@ +twine +pytest +gitpython \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8bfd5a1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[egg_info] +tag_build = +tag_date = 0 + diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..59534c6 --- /dev/null +++ b/setup.py @@ -0,0 +1,46 @@ +# -*- coding:utf-8 -*- + +import os +import skbuild +import setuptools + + +def get_long_desciption(): + cwd = os.path.abspath(os.path.dirname(__file__)) + filename = os.path.join(cwd, "README.md") + with open(filename) as f: + long_description = f.read() + + return long_description + + +skbuild.setup( + name = "pcre2", + version = "0.4.0", + description = "Python bindings for the PCRE2 regular expression library", + long_description = get_long_desciption(), + long_description_content_type = "text/markdown", + license = "BSD 3-Clause License", + author = "Garrett Tetrault", + url = "https://github.com/grtetrault/pcre2.py", + classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Programming Language :: C", + "Programming Language :: Cython", + "Programming Language :: Python :: 3.6", + "Programming 
Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows" + ], + include_package_data=True, + packages = setuptools.find_packages("src"), + package_dir = {"": "src"}, + cmake_languages = "C", +) diff --git a/src/pcre2.egg-info/PKG-INFO b/src/pcre2.egg-info/PKG-INFO new file mode 100644 index 0000000..0a6cd5b --- /dev/null +++ b/src/pcre2.egg-info/PKG-INFO @@ -0,0 +1,134 @@ +Metadata-Version: 2.1 +Name: pcre2 +Version: 0.4.0 +Summary: Python bindings for the PCRE2 regular expression library +Home-page: https://github.com/grtetrault/pcre2.py +Author: Garrett Tetrault +License: BSD 3-Clause License +Classifier: Development Status :: 3 - Alpha +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: C +Classifier: Programming Language :: Cython +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Operating System :: MacOS :: MacOS X +Classifier: Operating System :: POSIX :: Linux +Classifier: Operating System :: Microsoft :: Windows +Description-Content-Type: text/markdown +License-File: LICENSE + +# PCRE2.py: Python bindings for the PCRE2 regular expression library + +This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2). +PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel. 
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2). + +## Installation + +From PyPI: +``` +pip install pcre2 +``` + +If a wheel is not available for your platform, the module will be built from source. +Building requires: + +* `cmake` +* C compiler toolchain, such as `gcc` and `make` +* `libtool` +* Python headers + +## Usage + +Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects. +This returns a `Pattern` object. +Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled, + +```python +>>> import pcre2 +>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)' +>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True) +>>> # Patterns can also be JIT compiled after initialization. +>>> patn.jit_compile() +``` + +Inspection of `Pattern` objects is done as follows, + +```python +>>> patn.jit_size +980 +>>> patn.name_dict() +{1: 'head', 2: 'tail'} +>>> patn.options +524296 +>>> # Deeper inspection into options is available. +>>> pcre2.CompileOption.decompose(patn.options) +[, ] +``` + +Once compiled, `Pattern` objects can be used to match against strings. +Matching returns a `Match` object, which has several functions to view results, + +```python +>>> subj = 'foo bar buzz bazz' +>>> match = patn.match(subj) +>>> match.substring() +'foo bar' +>>> match.start(), match.end() +(8, 17) +``` + +Substitution is also supported, both from `Pattern` and `Match` objects, + +```python +>>> repl = '$2 $1' +>>> patn.substitute(repl, subj) # Global substitutions by default. +'bar foo bazz buzz' +>>> patn.substitute(repl, subj, suball=False) +'bar foo buzz bazz' +>>> match.expand(repl) +'bar foo buzz bazz' +``` + +Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches, + +```python +>>> for match in patn.scan(subj): +... print(match.substring('head')) +... 
+foo +buzz +``` + +## Performance + +PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled. +Below are the `regex-redux` benchmark results included in this repository, + +| Script | Number of runs | Total time | Real time | User time | System time | +| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- | +| `baseline.py` | 10 | 3.020 | 0.302 | 0.020 | 0.086 | +| `vanilla.py` | 10 | 51.380 | 5.138 | 11.408 | 0.529 | +| `hand_optimized.py` | 10 | 13.190 | 1.319 | 2.846 | 0.344 | +| `pcre2_module.py` | 10 | 13.670 | 1.367 | 2.269 | 0.532 | + +Script descriptions are as follows, + +| Script | Description | +| ------------------- | -------------------------------------------------------------------- | +| `baseline.py` | Reads input file and outputs stored expected output | +| `vanilla.py` | Pure Python version | +| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library | +| `pcre2_module.py` | Implementation using Python bindings written here | + +Tests were performed on an M2 Macbook Air. +Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset. +Additionally, a Python virtual environment must be created, and the package built +with `make init` and `make build` respectively. +For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html). +See source code of benchmark scripts for details and original sources. 
diff --git a/src/pcre2.egg-info/SOURCES.txt b/src/pcre2.egg-info/SOURCES.txt new file mode 100644 index 0000000..391a133 --- /dev/null +++ b/src/pcre2.egg-info/SOURCES.txt @@ -0,0 +1,498 @@ +CMakeLists.txt +LICENSE +Makefile +README.md +pyproject.toml +setup.py +requirements/build-requirements.txt +requirements/test-requirements.txt +src/libpcre2/.bazelrc +src/libpcre2/.git +src/libpcre2/.gitignore +src/libpcre2/132html +src/libpcre2/AUTHORS +src/libpcre2/BUILD.bazel +src/libpcre2/CMakeLists.txt +src/libpcre2/COPYING +src/libpcre2/ChangeLog +src/libpcre2/CheckMan +src/libpcre2/CleanTxt +src/libpcre2/Detrail +src/libpcre2/HACKING +src/libpcre2/LICENCE +src/libpcre2/MODULE.bazel +src/libpcre2/Makefile.am +src/libpcre2/NEWS +src/libpcre2/NON-AUTOTOOLS-BUILD +src/libpcre2/PrepareRelease +src/libpcre2/README +src/libpcre2/README.md +src/libpcre2/RunGrepTest +src/libpcre2/RunGrepTest.bat +src/libpcre2/RunTest +src/libpcre2/RunTest.bat +src/libpcre2/WORKSPACE.bazel +src/libpcre2/autogen.sh +src/libpcre2/build.zig +src/libpcre2/config-cmake.h.in +src/libpcre2/configure.ac +src/libpcre2/index.md +src/libpcre2/libpcre2-16.pc.in +src/libpcre2/libpcre2-32.pc.in +src/libpcre2/libpcre2-8.pc.in +src/libpcre2/libpcre2-posix.pc.in +src/libpcre2/pcre2-config.in +src/libpcre2/pcre2_fuzzer.dict +src/libpcre2/pcre2_fuzzer.options +src/libpcre2/pcre2_fuzzer_16.dict +src/libpcre2/pcre2_fuzzer_16.options +src/libpcre2/pcre2_fuzzer_32.dict +src/libpcre2/pcre2_fuzzer_32.options +src/libpcre2/perltest.sh +src/libpcre2/.github/workflows/build.yml +src/libpcre2/.github/workflows/cifuzz.yml +src/libpcre2/.github/workflows/codeql.yml +src/libpcre2/.github/workflows/dev.yml +src/libpcre2/.github/workflows/scorecards.yml +src/libpcre2/cmake/COPYING-CMAKE-SCRIPTS +src/libpcre2/cmake/FindEditline.cmake +src/libpcre2/cmake/FindPackageHandleStandardArgs.cmake +src/libpcre2/cmake/FindReadline.cmake +src/libpcre2/cmake/pcre2-config-version.cmake.in +src/libpcre2/cmake/pcre2-config.cmake.in 
+src/libpcre2/doc/index.html.src +src/libpcre2/doc/pcre2-config.1 +src/libpcre2/doc/pcre2-config.txt +src/libpcre2/doc/pcre2.3 +src/libpcre2/doc/pcre2.txt +src/libpcre2/doc/pcre2_callout_enumerate.3 +src/libpcre2/doc/pcre2_code_copy.3 +src/libpcre2/doc/pcre2_code_copy_with_tables.3 +src/libpcre2/doc/pcre2_code_free.3 +src/libpcre2/doc/pcre2_compile.3 +src/libpcre2/doc/pcre2_compile_context_copy.3 +src/libpcre2/doc/pcre2_compile_context_create.3 +src/libpcre2/doc/pcre2_compile_context_free.3 +src/libpcre2/doc/pcre2_config.3 +src/libpcre2/doc/pcre2_convert_context_copy.3 +src/libpcre2/doc/pcre2_convert_context_create.3 +src/libpcre2/doc/pcre2_convert_context_free.3 +src/libpcre2/doc/pcre2_converted_pattern_free.3 +src/libpcre2/doc/pcre2_dfa_match.3 +src/libpcre2/doc/pcre2_general_context_copy.3 +src/libpcre2/doc/pcre2_general_context_create.3 +src/libpcre2/doc/pcre2_general_context_free.3 +src/libpcre2/doc/pcre2_get_error_message.3 +src/libpcre2/doc/pcre2_get_mark.3 +src/libpcre2/doc/pcre2_get_match_data_heapframes_size.3 +src/libpcre2/doc/pcre2_get_match_data_size.3 +src/libpcre2/doc/pcre2_get_ovector_count.3 +src/libpcre2/doc/pcre2_get_ovector_pointer.3 +src/libpcre2/doc/pcre2_get_startchar.3 +src/libpcre2/doc/pcre2_jit_compile.3 +src/libpcre2/doc/pcre2_jit_free_unused_memory.3 +src/libpcre2/doc/pcre2_jit_match.3 +src/libpcre2/doc/pcre2_jit_stack_assign.3 +src/libpcre2/doc/pcre2_jit_stack_create.3 +src/libpcre2/doc/pcre2_jit_stack_free.3 +src/libpcre2/doc/pcre2_maketables.3 +src/libpcre2/doc/pcre2_maketables_free.3 +src/libpcre2/doc/pcre2_match.3 +src/libpcre2/doc/pcre2_match_context_copy.3 +src/libpcre2/doc/pcre2_match_context_create.3 +src/libpcre2/doc/pcre2_match_context_free.3 +src/libpcre2/doc/pcre2_match_data_create.3 +src/libpcre2/doc/pcre2_match_data_create_from_pattern.3 +src/libpcre2/doc/pcre2_match_data_free.3 +src/libpcre2/doc/pcre2_pattern_convert.3 +src/libpcre2/doc/pcre2_pattern_info.3 +src/libpcre2/doc/pcre2_serialize_decode.3 
+src/libpcre2/doc/pcre2_serialize_encode.3 +src/libpcre2/doc/pcre2_serialize_free.3 +src/libpcre2/doc/pcre2_serialize_get_number_of_codes.3 +src/libpcre2/doc/pcre2_set_bsr.3 +src/libpcre2/doc/pcre2_set_callout.3 +src/libpcre2/doc/pcre2_set_character_tables.3 +src/libpcre2/doc/pcre2_set_compile_extra_options.3 +src/libpcre2/doc/pcre2_set_compile_recursion_guard.3 +src/libpcre2/doc/pcre2_set_depth_limit.3 +src/libpcre2/doc/pcre2_set_glob_escape.3 +src/libpcre2/doc/pcre2_set_glob_separator.3 +src/libpcre2/doc/pcre2_set_heap_limit.3 +src/libpcre2/doc/pcre2_set_match_limit.3 +src/libpcre2/doc/pcre2_set_max_pattern_length.3 +src/libpcre2/doc/pcre2_set_max_varlookbehind.3 +src/libpcre2/doc/pcre2_set_newline.3 +src/libpcre2/doc/pcre2_set_offset_limit.3 +src/libpcre2/doc/pcre2_set_parens_nest_limit.3 +src/libpcre2/doc/pcre2_set_recursion_limit.3 +src/libpcre2/doc/pcre2_set_recursion_memory_management.3 +src/libpcre2/doc/pcre2_set_substitute_callout.3 +src/libpcre2/doc/pcre2_substitute.3 +src/libpcre2/doc/pcre2_substring_copy_byname.3 +src/libpcre2/doc/pcre2_substring_copy_bynumber.3 +src/libpcre2/doc/pcre2_substring_free.3 +src/libpcre2/doc/pcre2_substring_get_byname.3 +src/libpcre2/doc/pcre2_substring_get_bynumber.3 +src/libpcre2/doc/pcre2_substring_length_byname.3 +src/libpcre2/doc/pcre2_substring_length_bynumber.3 +src/libpcre2/doc/pcre2_substring_list_free.3 +src/libpcre2/doc/pcre2_substring_list_get.3 +src/libpcre2/doc/pcre2_substring_nametable_scan.3 +src/libpcre2/doc/pcre2_substring_number_from_name.3 +src/libpcre2/doc/pcre2api.3 +src/libpcre2/doc/pcre2build.3 +src/libpcre2/doc/pcre2callout.3 +src/libpcre2/doc/pcre2compat.3 +src/libpcre2/doc/pcre2convert.3 +src/libpcre2/doc/pcre2demo.3 +src/libpcre2/doc/pcre2grep.1 +src/libpcre2/doc/pcre2grep.txt +src/libpcre2/doc/pcre2jit.3 +src/libpcre2/doc/pcre2limits.3 +src/libpcre2/doc/pcre2matching.3 +src/libpcre2/doc/pcre2partial.3 +src/libpcre2/doc/pcre2pattern.3 +src/libpcre2/doc/pcre2perform.3 +src/libpcre2/doc/pcre2posix.3 
+src/libpcre2/doc/pcre2sample.3 +src/libpcre2/doc/pcre2serialize.3 +src/libpcre2/doc/pcre2syntax.3 +src/libpcre2/doc/pcre2test.1 +src/libpcre2/doc/pcre2test.txt +src/libpcre2/doc/pcre2unicode.3 +src/libpcre2/doc/html/NON-AUTOTOOLS-BUILD.txt +src/libpcre2/doc/html/README.txt +src/libpcre2/doc/html/index.html +src/libpcre2/doc/html/pcre2-config.html +src/libpcre2/doc/html/pcre2.html +src/libpcre2/doc/html/pcre2_callout_enumerate.html +src/libpcre2/doc/html/pcre2_code_copy.html +src/libpcre2/doc/html/pcre2_code_copy_with_tables.html +src/libpcre2/doc/html/pcre2_code_free.html +src/libpcre2/doc/html/pcre2_compile.html +src/libpcre2/doc/html/pcre2_compile_context_copy.html +src/libpcre2/doc/html/pcre2_compile_context_create.html +src/libpcre2/doc/html/pcre2_compile_context_free.html +src/libpcre2/doc/html/pcre2_config.html +src/libpcre2/doc/html/pcre2_convert_context_copy.html +src/libpcre2/doc/html/pcre2_convert_context_create.html +src/libpcre2/doc/html/pcre2_convert_context_free.html +src/libpcre2/doc/html/pcre2_converted_pattern_free.html +src/libpcre2/doc/html/pcre2_dfa_match.html +src/libpcre2/doc/html/pcre2_general_context_copy.html +src/libpcre2/doc/html/pcre2_general_context_create.html +src/libpcre2/doc/html/pcre2_general_context_free.html +src/libpcre2/doc/html/pcre2_get_error_message.html +src/libpcre2/doc/html/pcre2_get_mark.html +src/libpcre2/doc/html/pcre2_get_match_data_heapframes_size.html +src/libpcre2/doc/html/pcre2_get_match_data_size.html +src/libpcre2/doc/html/pcre2_get_ovector_count.html +src/libpcre2/doc/html/pcre2_get_ovector_pointer.html +src/libpcre2/doc/html/pcre2_get_startchar.html +src/libpcre2/doc/html/pcre2_jit_compile.html +src/libpcre2/doc/html/pcre2_jit_free_unused_memory.html +src/libpcre2/doc/html/pcre2_jit_match.html +src/libpcre2/doc/html/pcre2_jit_stack_assign.html +src/libpcre2/doc/html/pcre2_jit_stack_create.html +src/libpcre2/doc/html/pcre2_jit_stack_free.html +src/libpcre2/doc/html/pcre2_maketables.html 
+src/libpcre2/doc/html/pcre2_maketables_free.html +src/libpcre2/doc/html/pcre2_match.html +src/libpcre2/doc/html/pcre2_match_context_copy.html +src/libpcre2/doc/html/pcre2_match_context_create.html +src/libpcre2/doc/html/pcre2_match_context_free.html +src/libpcre2/doc/html/pcre2_match_data_create.html +src/libpcre2/doc/html/pcre2_match_data_create_from_pattern.html +src/libpcre2/doc/html/pcre2_match_data_free.html +src/libpcre2/doc/html/pcre2_pattern_convert.html +src/libpcre2/doc/html/pcre2_pattern_info.html +src/libpcre2/doc/html/pcre2_serialize_decode.html +src/libpcre2/doc/html/pcre2_serialize_encode.html +src/libpcre2/doc/html/pcre2_serialize_free.html +src/libpcre2/doc/html/pcre2_serialize_get_number_of_codes.html +src/libpcre2/doc/html/pcre2_set_bsr.html +src/libpcre2/doc/html/pcre2_set_callout.html +src/libpcre2/doc/html/pcre2_set_character_tables.html +src/libpcre2/doc/html/pcre2_set_compile_extra_options.html +src/libpcre2/doc/html/pcre2_set_compile_recursion_guard.html +src/libpcre2/doc/html/pcre2_set_depth_limit.html +src/libpcre2/doc/html/pcre2_set_glob_escape.html +src/libpcre2/doc/html/pcre2_set_glob_separator.html +src/libpcre2/doc/html/pcre2_set_heap_limit.html +src/libpcre2/doc/html/pcre2_set_match_limit.html +src/libpcre2/doc/html/pcre2_set_max_pattern_length.html +src/libpcre2/doc/html/pcre2_set_max_varlookbehind.html +src/libpcre2/doc/html/pcre2_set_newline.html +src/libpcre2/doc/html/pcre2_set_offset_limit.html +src/libpcre2/doc/html/pcre2_set_parens_nest_limit.html +src/libpcre2/doc/html/pcre2_set_recursion_limit.html +src/libpcre2/doc/html/pcre2_set_recursion_memory_management.html +src/libpcre2/doc/html/pcre2_set_substitute_callout.html +src/libpcre2/doc/html/pcre2_substitute.html +src/libpcre2/doc/html/pcre2_substring_copy_byname.html +src/libpcre2/doc/html/pcre2_substring_copy_bynumber.html +src/libpcre2/doc/html/pcre2_substring_free.html +src/libpcre2/doc/html/pcre2_substring_get_byname.html 
+src/libpcre2/doc/html/pcre2_substring_get_bynumber.html +src/libpcre2/doc/html/pcre2_substring_length_byname.html +src/libpcre2/doc/html/pcre2_substring_length_bynumber.html +src/libpcre2/doc/html/pcre2_substring_list_free.html +src/libpcre2/doc/html/pcre2_substring_list_get.html +src/libpcre2/doc/html/pcre2_substring_nametable_scan.html +src/libpcre2/doc/html/pcre2_substring_number_from_name.html +src/libpcre2/doc/html/pcre2api.html +src/libpcre2/doc/html/pcre2build.html +src/libpcre2/doc/html/pcre2callout.html +src/libpcre2/doc/html/pcre2compat.html +src/libpcre2/doc/html/pcre2convert.html +src/libpcre2/doc/html/pcre2demo.html +src/libpcre2/doc/html/pcre2grep.html +src/libpcre2/doc/html/pcre2jit.html +src/libpcre2/doc/html/pcre2limits.html +src/libpcre2/doc/html/pcre2matching.html +src/libpcre2/doc/html/pcre2partial.html +src/libpcre2/doc/html/pcre2pattern.html +src/libpcre2/doc/html/pcre2perform.html +src/libpcre2/doc/html/pcre2posix.html +src/libpcre2/doc/html/pcre2sample.html +src/libpcre2/doc/html/pcre2serialize.html +src/libpcre2/doc/html/pcre2syntax.html +src/libpcre2/doc/html/pcre2test.html +src/libpcre2/doc/html/pcre2unicode.html +src/libpcre2/m4/ax_pthread.m4 +src/libpcre2/m4/pcre2_visibility.m4 +src/libpcre2/maint/GenerateCommon.py +src/libpcre2/maint/GenerateTest26.py +src/libpcre2/maint/GenerateUcd.py +src/libpcre2/maint/GenerateUcpHeader.py +src/libpcre2/maint/GenerateUcpTables.py +src/libpcre2/maint/ManyConfigTests +src/libpcre2/maint/README +src/libpcre2/maint/pcre2_chartables.c.non-standard +src/libpcre2/maint/ucptest.c +src/libpcre2/maint/utf8.c +src/libpcre2/maint/Unicode.tables/BidiMirroring.txt +src/libpcre2/maint/Unicode.tables/CaseFolding.txt +src/libpcre2/maint/Unicode.tables/DerivedBidiClass.txt +src/libpcre2/maint/Unicode.tables/DerivedCoreProperties.txt +src/libpcre2/maint/Unicode.tables/DerivedGeneralCategory.txt +src/libpcre2/maint/Unicode.tables/GraphemeBreakProperty.txt +src/libpcre2/maint/Unicode.tables/PropList.txt 
+src/libpcre2/maint/Unicode.tables/PropertyAliases.txt +src/libpcre2/maint/Unicode.tables/PropertyValueAliases.txt +src/libpcre2/maint/Unicode.tables/ScriptExtensions.txt +src/libpcre2/maint/Unicode.tables/Scripts.txt +src/libpcre2/maint/Unicode.tables/UnicodeData.txt +src/libpcre2/maint/Unicode.tables/emoji-data.txt +src/libpcre2/maint/ucptestdata/testinput1 +src/libpcre2/maint/ucptestdata/testinput2 +src/libpcre2/maint/ucptestdata/testoutput1 +src/libpcre2/maint/ucptestdata/testoutput2 +src/libpcre2/src/config.h.generic +src/libpcre2/src/config.h.in +src/libpcre2/src/pcre2.h.generic +src/libpcre2/src/pcre2.h.in +src/libpcre2/src/pcre2_auto_possess.c +src/libpcre2/src/pcre2_chartables.c.dist +src/libpcre2/src/pcre2_chkdint.c +src/libpcre2/src/pcre2_compile.c +src/libpcre2/src/pcre2_config.c +src/libpcre2/src/pcre2_context.c +src/libpcre2/src/pcre2_convert.c +src/libpcre2/src/pcre2_dfa_match.c +src/libpcre2/src/pcre2_dftables.c +src/libpcre2/src/pcre2_error.c +src/libpcre2/src/pcre2_extuni.c +src/libpcre2/src/pcre2_find_bracket.c +src/libpcre2/src/pcre2_fuzzsupport.c +src/libpcre2/src/pcre2_internal.h +src/libpcre2/src/pcre2_intmodedep.h +src/libpcre2/src/pcre2_jit_compile.c +src/libpcre2/src/pcre2_jit_match.c +src/libpcre2/src/pcre2_jit_misc.c +src/libpcre2/src/pcre2_jit_neon_inc.h +src/libpcre2/src/pcre2_jit_simd_inc.h +src/libpcre2/src/pcre2_jit_test.c +src/libpcre2/src/pcre2_maketables.c +src/libpcre2/src/pcre2_match.c +src/libpcre2/src/pcre2_match_data.c +src/libpcre2/src/pcre2_newline.c +src/libpcre2/src/pcre2_ord2utf.c +src/libpcre2/src/pcre2_pattern_info.c +src/libpcre2/src/pcre2_printint.c +src/libpcre2/src/pcre2_script_run.c +src/libpcre2/src/pcre2_serialize.c +src/libpcre2/src/pcre2_string_utils.c +src/libpcre2/src/pcre2_study.c +src/libpcre2/src/pcre2_substitute.c +src/libpcre2/src/pcre2_substring.c +src/libpcre2/src/pcre2_tables.c +src/libpcre2/src/pcre2_ucd.c +src/libpcre2/src/pcre2_ucp.h +src/libpcre2/src/pcre2_ucptables.c 
+src/libpcre2/src/pcre2_valid_utf.c +src/libpcre2/src/pcre2_xclass.c +src/libpcre2/src/pcre2demo.c +src/libpcre2/src/pcre2grep.c +src/libpcre2/src/pcre2posix.c +src/libpcre2/src/pcre2posix.h +src/libpcre2/src/pcre2posix_test.c +src/libpcre2/src/pcre2test.c +src/libpcre2/src/sljit/sljitConfig.h +src/libpcre2/src/sljit/sljitConfigCPU.h +src/libpcre2/src/sljit/sljitConfigInternal.h +src/libpcre2/src/sljit/sljitLir.c +src/libpcre2/src/sljit/sljitLir.h +src/libpcre2/src/sljit/sljitNativeARM_32.c +src/libpcre2/src/sljit/sljitNativeARM_64.c +src/libpcre2/src/sljit/sljitNativeARM_T2_32.c +src/libpcre2/src/sljit/sljitNativeLOONGARCH_64.c +src/libpcre2/src/sljit/sljitNativeMIPS_32.c +src/libpcre2/src/sljit/sljitNativeMIPS_64.c +src/libpcre2/src/sljit/sljitNativeMIPS_common.c +src/libpcre2/src/sljit/sljitNativePPC_32.c +src/libpcre2/src/sljit/sljitNativePPC_64.c +src/libpcre2/src/sljit/sljitNativePPC_common.c +src/libpcre2/src/sljit/sljitNativeRISCV_32.c +src/libpcre2/src/sljit/sljitNativeRISCV_64.c +src/libpcre2/src/sljit/sljitNativeRISCV_common.c +src/libpcre2/src/sljit/sljitNativeS390X.c +src/libpcre2/src/sljit/sljitNativeX86_32.c +src/libpcre2/src/sljit/sljitNativeX86_64.c +src/libpcre2/src/sljit/sljitNativeX86_common.c +src/libpcre2/src/sljit/sljitUtils.c +src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorApple.c +src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorCore.c +src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorFreeBSD.c +src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorPosix.c +src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorWindows.c +src/libpcre2/src/sljit/allocator_src/sljitProtExecAllocatorNetBSD.c +src/libpcre2/src/sljit/allocator_src/sljitProtExecAllocatorPosix.c +src/libpcre2/src/sljit/allocator_src/sljitWXExecAllocatorPosix.c +src/libpcre2/src/sljit/allocator_src/sljitWXExecAllocatorWindows.c +src/libpcre2/testdata/grepbinary +src/libpcre2/testdata/grepfilelist +src/libpcre2/testdata/grepinput 
+src/libpcre2/testdata/grepinput3 +src/libpcre2/testdata/grepinput8 +src/libpcre2/testdata/grepinputC.bz2 +src/libpcre2/testdata/grepinputC.gz +src/libpcre2/testdata/grepinputM +src/libpcre2/testdata/grepinputv +src/libpcre2/testdata/grepinputx +src/libpcre2/testdata/greplist +src/libpcre2/testdata/grepnot.bz2 +src/libpcre2/testdata/grepoutput +src/libpcre2/testdata/grepoutput8 +src/libpcre2/testdata/grepoutputC +src/libpcre2/testdata/grepoutputCN +src/libpcre2/testdata/grepoutputCNU +src/libpcre2/testdata/grepoutputCU +src/libpcre2/testdata/grepoutputCbz2 +src/libpcre2/testdata/grepoutputCgz +src/libpcre2/testdata/grepoutputN +src/libpcre2/testdata/grepoutputUN +src/libpcre2/testdata/greppatN4 +src/libpcre2/testdata/testbtables +src/libpcre2/testdata/testinput1 +src/libpcre2/testdata/testinput10 +src/libpcre2/testdata/testinput11 +src/libpcre2/testdata/testinput12 +src/libpcre2/testdata/testinput13 +src/libpcre2/testdata/testinput14 +src/libpcre2/testdata/testinput15 +src/libpcre2/testdata/testinput16 +src/libpcre2/testdata/testinput17 +src/libpcre2/testdata/testinput18 +src/libpcre2/testdata/testinput19 +src/libpcre2/testdata/testinput2 +src/libpcre2/testdata/testinput20 +src/libpcre2/testdata/testinput21 +src/libpcre2/testdata/testinput22 +src/libpcre2/testdata/testinput23 +src/libpcre2/testdata/testinput24 +src/libpcre2/testdata/testinput25 +src/libpcre2/testdata/testinput26 +src/libpcre2/testdata/testinput3 +src/libpcre2/testdata/testinput4 +src/libpcre2/testdata/testinput5 +src/libpcre2/testdata/testinput6 +src/libpcre2/testdata/testinput7 +src/libpcre2/testdata/testinput8 +src/libpcre2/testdata/testinput9 +src/libpcre2/testdata/testinputEBC +src/libpcre2/testdata/testinputheap +src/libpcre2/testdata/testoutput1 +src/libpcre2/testdata/testoutput10 +src/libpcre2/testdata/testoutput11-16 +src/libpcre2/testdata/testoutput11-32 +src/libpcre2/testdata/testoutput12-16 +src/libpcre2/testdata/testoutput12-32 +src/libpcre2/testdata/testoutput13 
+src/libpcre2/testdata/testoutput14-16 +src/libpcre2/testdata/testoutput14-32 +src/libpcre2/testdata/testoutput14-8 +src/libpcre2/testdata/testoutput15 +src/libpcre2/testdata/testoutput16 +src/libpcre2/testdata/testoutput17 +src/libpcre2/testdata/testoutput18 +src/libpcre2/testdata/testoutput19 +src/libpcre2/testdata/testoutput2 +src/libpcre2/testdata/testoutput20 +src/libpcre2/testdata/testoutput21 +src/libpcre2/testdata/testoutput22-16 +src/libpcre2/testdata/testoutput22-32 +src/libpcre2/testdata/testoutput22-8 +src/libpcre2/testdata/testoutput23 +src/libpcre2/testdata/testoutput24 +src/libpcre2/testdata/testoutput25 +src/libpcre2/testdata/testoutput26 +src/libpcre2/testdata/testoutput3 +src/libpcre2/testdata/testoutput3A +src/libpcre2/testdata/testoutput3B +src/libpcre2/testdata/testoutput4 +src/libpcre2/testdata/testoutput5 +src/libpcre2/testdata/testoutput6 +src/libpcre2/testdata/testoutput7 +src/libpcre2/testdata/testoutput8-16-2 +src/libpcre2/testdata/testoutput8-16-3 +src/libpcre2/testdata/testoutput8-16-4 +src/libpcre2/testdata/testoutput8-32-2 +src/libpcre2/testdata/testoutput8-32-3 +src/libpcre2/testdata/testoutput8-32-4 +src/libpcre2/testdata/testoutput8-8-2 +src/libpcre2/testdata/testoutput8-8-3 +src/libpcre2/testdata/testoutput8-8-4 +src/libpcre2/testdata/testoutput9 +src/libpcre2/testdata/testoutputEBC +src/libpcre2/testdata/testoutputheap-16 +src/libpcre2/testdata/testoutputheap-32 +src/libpcre2/testdata/testoutputheap-8 +src/libpcre2/testdata/valgrind-jit.supp +src/libpcre2/testdata/wintestinput3 +src/libpcre2/testdata/wintestoutput3 +src/pcre2/CMakeLists.txt +src/pcre2/__init__.py +src/pcre2/consts.pxd +src/pcre2/consts.pyx +src/pcre2/exceptions.pxd +src/pcre2/exceptions.pyx +src/pcre2/libpcre2.pxd +src/pcre2/match.pxd +src/pcre2/match.pyx +src/pcre2/methods.pxd +src/pcre2/methods.pyx +src/pcre2/pattern.pxd +src/pcre2/pattern.pyx +src/pcre2/scanner.pxd +src/pcre2/scanner.pyx +src/pcre2/utils.pxd +src/pcre2/utils.pyx +src/pcre2.egg-info/PKG-INFO 
+src/pcre2.egg-info/SOURCES.txt +src/pcre2.egg-info/dependency_links.txt +src/pcre2.egg-info/top_level.txt +tests/test_groups.py +tests/test_match.py +tests/test_pattern.py \ No newline at end of file diff --git a/src/pcre2.egg-info/dependency_links.txt b/src/pcre2.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/pcre2.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/pcre2.egg-info/top_level.txt b/src/pcre2.egg-info/top_level.txt new file mode 100644 index 0000000..92d5e6d --- /dev/null +++ b/src/pcre2.egg-info/top_level.txt @@ -0,0 +1 @@ +pcre2 diff --git a/src/pcre2/CMakeLists.txt b/src/pcre2/CMakeLists.txt new file mode 100644 index 0000000..38c117e --- /dev/null +++ b/src/pcre2/CMakeLists.txt @@ -0,0 +1,53 @@ +find_package(Cython MODULE REQUIRED) +find_package(PythonExtensions MODULE REQUIRED) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# Build Cython with annotations. +set(CYTHON_ANNOTATE TRUE) + +# Macro to add Cython files as modules, configured to build with PCRE2. +macro(add_pyx_file filename) + add_cython_target(${filename} C PY3) + add_library(${filename} MODULE ${filename}) + python_extension_module(${filename}) + + target_link_libraries(${filename} pcre2-8-static) + target_include_directories(${filename} PRIVATE ${PCRE2_INCLUDE_DIR}) + target_compile_options(${filename} PRIVATE ${CYTHON_EXTRA_COMPILE_ARGS}) + + install(TARGETS ${filename} LIBRARY DESTINATION src/pcre2) +endmacro() + +# GLOB pattern is recommended against, +# https://cmake.org/cmake/help/v3.14/command/file.html?highlight=file#filesystem +add_pyx_file(consts) +add_pyx_file(exceptions) +add_pyx_file(match) +add_pyx_file(methods) +add_pyx_file(pattern) +add_pyx_file(scanner) +add_pyx_file(utils) + + +# Include .pyx and .pxd files in distribution for use by Cython API. 
+install( + FILES + consts.pxd + consts.pyx + exceptions.pxd + exceptions.pyx + libpcre2.pxd + match.pxd + match.pyx + methods.pxd + methods.pyx + pattern.pxd + pattern.pyx + scanner.pxd + scanner.pyx + utils.pxd + utils.pyx + DESTINATION + src/pcre2 +) \ No newline at end of file diff --git a/src/pcre2/__init__.py b/src/pcre2/__init__.py new file mode 100755 index 0000000..e08e5b5 --- /dev/null +++ b/src/pcre2/__init__.py @@ -0,0 +1,7 @@ +from .methods import compile, findall, match, scan, split, substitute +from .consts import ( + __libpcre2_version__, + CompileOption, + A, I, M, U, S, X +) +__version__ = "0.4.0" diff --git a/src/pcre2/consts.pxd b/src/pcre2/consts.pxd new file mode 100644 index 0000000..e69de29 diff --git a/src/pcre2/consts.pyx b/src/pcre2/consts.pyx new file mode 100644 index 0000000..0970dde --- /dev/null +++ b/src/pcre2/consts.pyx @@ -0,0 +1,89 @@ +# -*- coding:utf-8 -*- + +# Standard libraries. +from enum import IntEnum + +# Local imports. +from .utils cimport * +from .libpcre2 cimport * + + +__libpcre2_version__ = f"{PCRE2_MAJOR}.{PCRE2_MINOR}" + + +class MetaOption(IntEnum): + def __repr__(self): + return f"<{self.__class__.__name__}.{self._name_}: 0x{self._value_:x}>" + + @classmethod + def verify(cls, options): + """ Verify a number is composed of options. + """ + tmp = options + for opt in cls: + tmp ^= (opt & tmp) + return tmp == 0 + + + @classmethod + def decompose(cls, options): + """ Decompose a number into its component options, returning a list of + MetaOption enums that are components of the given options. Note that + left over bits are ignored, and veracity can not be determined from + the result. + """ + return [opt for opt in cls if (opt & options)] + + +class CompileOption(MetaOption): + """ Option bits to be used in pattern compilation. 
See the following PCRE2 + documentation for a brief overview of the relevant options: + http://pcre.org/current/doc/html/pcre2_compile.html + """ + + ALLOW_EMPTY_CLASS = PCRE2_ALLOW_EMPTY_CLASS + ALT_BSUX = PCRE2_ALT_BSUX + ALT_CIRCUMFLEX = PCRE2_ALT_CIRCUMFLEX + ALT_VERBNAMES = PCRE2_ALT_VERBNAMES + ANCHORED = PCRE2_ANCHORED + CASELESS = PCRE2_CASELESS + DOLLAR_ENDONLY = PCRE2_DOLLAR_ENDONLY + DOTALL = PCRE2_DOTALL + DUPNAMES = PCRE2_DUPNAMES + ENDANCHORED = PCRE2_ENDANCHORED + EXTENDED = PCRE2_EXTENDED + EXTENDED_MORE = PCRE2_EXTENDED_MORE + FIRSTLINE = PCRE2_FIRSTLINE + LITERAL = PCRE2_LITERAL + MATCH_UNSET_BACKREF = PCRE2_MATCH_UNSET_BACKREF + MULTILINE = PCRE2_MULTILINE + UCP = PCRE2_UCP + UNGREEDY = PCRE2_UNGREEDY + UTF = PCRE2_UTF + + +class BsrChar(IntEnum): + """ Indicator for what character(s) are denoted by `\r`. + """ + UNICODE = PCRE2_BSR_UNICODE + ANYCRLF = PCRE2_BSR_ANYCRLF + + +class NewlineChar(IntEnum): + """ Indicator for what character(s) denote a newline. + """ + CR = PCRE2_NEWLINE_CR + LF = PCRE2_NEWLINE_LF + CRLF = PCRE2_NEWLINE_CRLF + ANY = PCRE2_NEWLINE_ANY + ANYCRLF = PCRE2_NEWLINE_ANYCRLF + NUL = PCRE2_NEWLINE_NUL + + +# Shorthands +A = CompileOption.ANCHORED +I = CompileOption.CASELESS +M = CompileOption.MULTILINE +U = CompileOption.UTF +S = CompileOption.DOTALL +X = CompileOption.EXTENDED diff --git a/src/pcre2/exceptions.pxd b/src/pcre2/exceptions.pxd new file mode 100755 index 0000000..e69de29 diff --git a/src/pcre2/exceptions.pyx b/src/pcre2/exceptions.pyx new file mode 100755 index 0000000..f0ce9e3 --- /dev/null +++ b/src/pcre2/exceptions.pyx @@ -0,0 +1,63 @@ +# -*- coding:utf-8 -*- + +# Standard libraries. +from libc.stdint cimport uint8_t + + +# Local imports. +from .utils cimport * +from .libpcre2 cimport * + + +class LibraryError(Exception): + """ Catch all for other PCRE2 errors (e.g. bad option bits). 
+ """ + + def __init__(self, errorcode, context_msg=""): + cdef uint8_t errormsg_buf[120] + get_error_message_rc = pcre2_get_error_message( + errorcode, + errormsg_buf, sizeof(errormsg_buf) + ) + + # Handle errors in fetching error message. + if get_error_message_rc == PCRE2_ERROR_NOMEMORY: + raise MemoryError() + elif get_error_message_rc < 0: + raise LibraryError( + get_error_message_rc, + context_msg=f"Could not retrieve message for error code {get_error_message_rc}." + ) + + msg = errormsg_buf.decode("utf-8").capitalize() + if context_msg: + msg = context_msg + ". " + msg + + super().__init__(msg) + self.errorcode = errorcode + + +class CompileError(LibraryError): + """ Raised when pattern is malformed or is otherwise unable to be + compiled. + """ + + def __init__(self, errorcode, context_msg=""): + if not (errorcode > 0): + raise ValueError("Compilation error codes are strictly positive") + + super().__init__(errorcode, context_msg=context_msg) + + +class MatchError(LibraryError): + """ Raised when no or partial match found. + """ + + def __init__(self, errorcode, context_msg=""): + if not (errorcode == PCRE2_ERROR_NOMATCH or errorcode == PCRE2_ERROR_PARTIAL): + raise ValueError( + f"Invalid error code '{errorcode}'. " + "Match error codes can only be of value PCRE2_ERROR_NOMATCH or PCRE2_ERROR_PARTIAL" + ) + + super().__init__(errorcode, context_msg=context_msg) diff --git a/src/pcre2/libpcre2.pxd b/src/pcre2/libpcre2.pxd new file mode 100755 index 0000000..1d95e9d --- /dev/null +++ b/src/pcre2/libpcre2.pxd @@ -0,0 +1,501 @@ +# -*- coding:utf-8 -*- + +from libc.stdint cimport uint8_t, uint32_t, int32_t + + +cdef extern from "pcre2.h": + cdef unsigned int PCRE2_MAJOR + cdef unsigned int PCRE2_MINOR + + # The following option bits can be passed to pcre2_compile(), + # pcre2_match(), or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the + # function to which it is passed. 
Put these bits at the most significant + # end of the options word so others can be added next to them. + cdef unsigned int PCRE2_ANCHORED + cdef unsigned int PCRE2_NO_UTF_CHECK + cdef unsigned int PCRE2_ENDANCHORED + + # The following option bits can be passed only to pcre2_compile(). However, + # they may affect compilation, JIT compilation, and/or interpretive + # execution. The following tags indicate which: + # C alters what is compiled by pcre2_compile() + # J alters what is compiled by pcre2_jit_compile() + # M is inspected during pcre2_match() execution + # D is inspected during pcre2_dfa_match() execution + cdef unsigned int PCRE2_ALLOW_EMPTY_CLASS # C + cdef unsigned int PCRE2_ALT_BSUX # C + cdef unsigned int PCRE2_AUTO_CALLOUT # C + cdef unsigned int PCRE2_CASELESS # C + cdef unsigned int PCRE2_DOLLAR_ENDONLY # J M D + cdef unsigned int PCRE2_DOTALL # C + cdef unsigned int PCRE2_DUPNAMES # C + cdef unsigned int PCRE2_EXTENDED # C + cdef unsigned int PCRE2_FIRSTLINE # J M D + cdef unsigned int PCRE2_MATCH_UNSET_BACKREF # C J M + cdef unsigned int PCRE2_MULTILINE # C + cdef unsigned int PCRE2_NEVER_UCP # C + cdef unsigned int PCRE2_NEVER_UTF # C + cdef unsigned int PCRE2_NO_AUTO_CAPTURE # C + cdef unsigned int PCRE2_NO_AUTO_POSSESS # C + cdef unsigned int PCRE2_NO_DOTSTAR_ANCHOR # C + cdef unsigned int PCRE2_NO_START_OPTIMIZE # J M D + cdef unsigned int PCRE2_UCP # C J M D + cdef unsigned int PCRE2_UNGREEDY # C + cdef unsigned int PCRE2_UTF # C J M D + cdef unsigned int PCRE2_NEVER_BACKSLASH_C # C + cdef unsigned int PCRE2_ALT_CIRCUMFLEX # J M D + cdef unsigned int PCRE2_ALT_VERBNAMES # C + cdef unsigned int PCRE2_USE_OFFSET_LIMIT # J M D + cdef unsigned int PCRE2_EXTENDED_MORE # C + cdef unsigned int PCRE2_LITERAL # C + cdef unsigned int PCRE2_MATCH_INVALID_UTF # J M D + + # An additional compile options word is available in the compile context. 
+ cdef unsigned int PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES # C + cdef unsigned int PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL # C + cdef unsigned int PCRE2_EXTRA_MATCH_WORD # C + cdef unsigned int PCRE2_EXTRA_MATCH_LINE # C + cdef unsigned int PCRE2_EXTRA_ESCAPED_CR_IS_LF # C + cdef unsigned int PCRE2_EXTRA_ALT_BSUX # C + cdef unsigned int PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK # C + + # These are for pcre2_jit_compile(). + cdef unsigned int PCRE2_JIT_COMPLETE # For full matching. + cdef unsigned int PCRE2_JIT_PARTIAL_SOFT + cdef unsigned int PCRE2_JIT_PARTIAL_HARD + cdef unsigned int PCRE2_JIT_INVALID_UTF + + # These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and + # pcre2_substitute(). Some are allowed only for one of the functions, and + # in these cases it is noted below. Note that PCRE2_ANCHORED, + # PCRE2_ENDANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these + # functions (though pcre2_jit_match() ignores the latter since it bypasses + # all sanity checks). + cdef unsigned int PCRE2_NOTBOL + cdef unsigned int PCRE2_NOTEOL + cdef unsigned int PCRE2_NOTEMPTY # ) These two must be kept + cdef unsigned int PCRE2_NOTEMPTY_ATSTART # ) adjacent to each other. 
+ cdef unsigned int PCRE2_PARTIAL_SOFT + cdef unsigned int PCRE2_PARTIAL_HARD + cdef unsigned int PCRE2_DFA_RESTART # pcre2_dfa_match() only + cdef unsigned int PCRE2_DFA_SHORTEST # pcre2_dfa_match() only + cdef unsigned int PCRE2_SUBSTITUTE_GLOBAL # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_EXTENDED # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_UNSET_EMPTY # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_UNKNOWN_UNSET # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_OVERFLOW_LENGTH # pcre2_substitute() only + cdef unsigned int PCRE2_NO_JIT # Not for pcre2_dfa_match() + cdef unsigned int PCRE2_COPY_MATCHED_SUBJECT + cdef unsigned int PCRE2_SUBSTITUTE_LITERAL # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_MATCHED # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_REPLACEMENT_ONLY # pcre2_substitute() only + + # Options for pcre2_pattern_convert(). + cdef unsigned int PCRE2_CONVERT_UTF + cdef unsigned int PCRE2_CONVERT_NO_UTF_CHECK + cdef unsigned int PCRE2_CONVERT_POSIX_BASIC + cdef unsigned int PCRE2_CONVERT_POSIX_EXTENDED + cdef unsigned int PCRE2_CONVERT_GLOB + cdef unsigned int PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR + cdef unsigned int PCRE2_CONVERT_GLOB_NO_STARSTAR + + # Newline and \R settings, for use in compile contexts. The newline values + # must be kept in step with values set in config.h and both sets must all + # be greater than zero. + cdef int PCRE2_NEWLINE_CR + cdef int PCRE2_NEWLINE_LF + cdef int PCRE2_NEWLINE_CRLF + cdef int PCRE2_NEWLINE_ANY + cdef int PCRE2_NEWLINE_ANYCRLF + cdef int PCRE2_NEWLINE_NUL + + cdef int PCRE2_BSR_UNICODE + cdef int PCRE2_BSR_ANYCRLF + + # Error codes for pcre2_compile(). Some of these are also used by + # pcre2_pattern_convert(). 
+ cdef int PCRE2_ERROR_END_BACKSLASH + cdef int PCRE2_ERROR_END_BACKSLASH_C + cdef int PCRE2_ERROR_UNKNOWN_ESCAPE + cdef int PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER + cdef int PCRE2_ERROR_QUANTIFIER_TOO_BIG + cdef int PCRE2_ERROR_MISSING_SQUARE_BRACKET + cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS + cdef int PCRE2_ERROR_CLASS_RANGE_ORDER + cdef int PCRE2_ERROR_QUANTIFIER_INVALID + cdef int PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT + cdef int PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY + cdef int PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS + cdef int PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING + cdef int PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS + cdef int PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE + cdef int PCRE2_ERROR_NULL_PATTERN + cdef int PCRE2_ERROR_BAD_OPTIONS + cdef int PCRE2_ERROR_MISSING_COMMENT_CLOSING + cdef int PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP + cdef int PCRE2_ERROR_PATTERN_TOO_LARGE + cdef int PCRE2_ERROR_HEAP_FAILED + cdef int PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS + cdef int PCRE2_ERROR_INTERNAL_CODE_OVERFLOW + cdef int PCRE2_ERROR_MISSING_CONDITION_CLOSING + cdef int PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH + cdef int PCRE2_ERROR_ZERO_RELATIVE_REFERENCE + cdef int PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES + cdef int PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED + cdef int PCRE2_ERROR_BAD_RELATIVE_REFERENCE + cdef int PCRE2_ERROR_UNKNOWN_POSIX_CLASS + cdef int PCRE2_ERROR_INTERNAL_STUDY_ERROR + cdef int PCRE2_ERROR_UNICODE_NOT_SUPPORTED + cdef int PCRE2_ERROR_PARENTHESES_STACK_CHECK + cdef int PCRE2_ERROR_CODE_POINT_TOO_BIG + cdef int PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED + cdef int PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C + cdef int PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE + cdef int PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG + cdef int PCRE2_ERROR_MISSING_CALLOUT_CLOSING + cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_VERB + cdef int PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P + cdef int PCRE2_ERROR_MISSING_NAME_TERMINATOR + cdef int PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME + cdef int 
PCRE2_ERROR_INVALID_SUBPATTERN_NAME + cdef int PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE + cdef int PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY + cdef int PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY + cdef int PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG + cdef int PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS + cdef int PCRE2_ERROR_CLASS_INVALID_RANGE + cdef int PCRE2_ERROR_OCTAL_BYTE_TOO_BIG + cdef int PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE + cdef int PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN + cdef int PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES + cdef int PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE + cdef int PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE + cdef int PCRE2_ERROR_BACKSLASH_G_SYNTAX + cdef int PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING + # Error 159 is obsolete and should now never occur + cdef int PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED + cdef int PCRE2_ERROR_VERB_UNKNOWN + cdef int PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG + cdef int PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED + cdef int PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW + cdef int PCRE2_ERROR_INVALID_OCTAL + cdef int PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH + cdef int PCRE2_ERROR_MARK_MISSING_ARGUMENT + cdef int PCRE2_ERROR_INVALID_HEXADECIMAL + cdef int PCRE2_ERROR_BACKSLASH_C_SYNTAX + cdef int PCRE2_ERROR_BACKSLASH_K_SYNTAX + cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS + cdef int PCRE2_ERROR_BACKSLASH_N_IN_CLASS + cdef int PCRE2_ERROR_CALLOUT_STRING_TOO_LONG + cdef int PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT + cdef int PCRE2_ERROR_UTF_IS_DISABLED + cdef int PCRE2_ERROR_UCP_IS_DISABLED + cdef int PCRE2_ERROR_VERB_NAME_TOO_LONG + cdef int PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG + cdef int PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS + cdef int PCRE2_ERROR_VERSION_CONDITION_SYNTAX + cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS + cdef int PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER + cdef int PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER + cdef int PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED + cdef int PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP + cdef int 
PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED + cdef int PCRE2_ERROR_PATTERN_TOO_COMPLICATED + cdef int PCRE2_ERROR_LOOKBEHIND_TOO_LONG + cdef int PCRE2_ERROR_PATTERN_STRING_TOO_LONG + cdef int PCRE2_ERROR_INTERNAL_BAD_CODE + cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP + cdef int PCRE2_ERROR_NO_SURROGATES_IN_UTF16 + cdef int PCRE2_ERROR_BAD_LITERAL_OPTIONS + cdef int PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE + cdef int PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS + cdef int PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN + cdef int PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE + cdef int PCRE2_ERROR_TOO_MANY_CAPTURES + cdef int PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED + cdef int PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND + + # "Expected" matching error codes: no match and partial match. + cdef int PCRE2_ERROR_NOMATCH + cdef int PCRE2_ERROR_PARTIAL + + # Error codes for UTF-8 validity checks. + cdef int PCRE2_ERROR_UTF8_ERR1 + cdef int PCRE2_ERROR_UTF8_ERR2 + cdef int PCRE2_ERROR_UTF8_ERR3 + cdef int PCRE2_ERROR_UTF8_ERR4 + cdef int PCRE2_ERROR_UTF8_ERR5 + cdef int PCRE2_ERROR_UTF8_ERR6 + cdef int PCRE2_ERROR_UTF8_ERR7 + cdef int PCRE2_ERROR_UTF8_ERR8 + cdef int PCRE2_ERROR_UTF8_ERR9 + cdef int PCRE2_ERROR_UTF8_ERR10 + cdef int PCRE2_ERROR_UTF8_ERR11 + cdef int PCRE2_ERROR_UTF8_ERR12 + cdef int PCRE2_ERROR_UTF8_ERR13 + cdef int PCRE2_ERROR_UTF8_ERR14 + cdef int PCRE2_ERROR_UTF8_ERR15 + cdef int PCRE2_ERROR_UTF8_ERR16 + cdef int PCRE2_ERROR_UTF8_ERR17 + cdef int PCRE2_ERROR_UTF8_ERR18 + cdef int PCRE2_ERROR_UTF8_ERR19 + cdef int PCRE2_ERROR_UTF8_ERR20 + cdef int PCRE2_ERROR_UTF8_ERR21 + + # Error codes for UTF-16 validity checks. + cdef int PCRE2_ERROR_UTF16_ERR1 + cdef int PCRE2_ERROR_UTF16_ERR2 + cdef int PCRE2_ERROR_UTF16_ERR3 + + # Error codes for UTF-32 validity checks. + cdef int PCRE2_ERROR_UTF32_ERR1 + cdef int PCRE2_ERROR_UTF32_ERR2 + + # Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction + # functions, context functions, and serializing functions. 
They are in + # numerical order. Originally they were in alphabetical order too, but now + # that PCRE2 is released, the numbers must not be changed. + cdef int PCRE2_ERROR_BADDATA + cdef int PCRE2_ERROR_MIXEDTABLES # Name was changed. + cdef int PCRE2_ERROR_BADMAGIC + cdef int PCRE2_ERROR_BADMODE + cdef int PCRE2_ERROR_BADOFFSET + cdef int PCRE2_ERROR_BADOPTION + cdef int PCRE2_ERROR_BADREPLACEMENT + cdef int PCRE2_ERROR_BADUTFOFFSET + cdef int PCRE2_ERROR_CALLOUT # Never used by PCRE2 itself. + cdef int PCRE2_ERROR_DFA_BADRESTART + cdef int PCRE2_ERROR_DFA_RECURSE + cdef int PCRE2_ERROR_DFA_UCOND + cdef int PCRE2_ERROR_DFA_UFUNC + cdef int PCRE2_ERROR_DFA_UITEM + cdef int PCRE2_ERROR_DFA_WSSIZE + cdef int PCRE2_ERROR_INTERNAL + cdef int PCRE2_ERROR_JIT_BADOPTION + cdef int PCRE2_ERROR_JIT_STACKLIMIT + cdef int PCRE2_ERROR_MATCHLIMIT + cdef int PCRE2_ERROR_NOMEMORY + cdef int PCRE2_ERROR_NOSUBSTRING + cdef int PCRE2_ERROR_NOUNIQUESUBSTRING + cdef int PCRE2_ERROR_NULL + cdef int PCRE2_ERROR_RECURSELOOP + cdef int PCRE2_ERROR_DEPTHLIMIT + cdef int PCRE2_ERROR_RECURSIONLIMIT # Obsolete synonym. + cdef int PCRE2_ERROR_UNAVAILABLE + cdef int PCRE2_ERROR_UNSET + cdef int PCRE2_ERROR_BADOFFSETLIMIT + cdef int PCRE2_ERROR_BADREPESCAPE + cdef int PCRE2_ERROR_REPMISSINGBRACE + cdef int PCRE2_ERROR_BADSUBSTITUTION + cdef int PCRE2_ERROR_BADSUBSPATTERN + cdef int PCRE2_ERROR_TOOMANYREPLACE + cdef int PCRE2_ERROR_BADSERIALIZEDDATA + cdef int PCRE2_ERROR_HEAPLIMIT + cdef int PCRE2_ERROR_CONVERT_SYNTAX + cdef int PCRE2_ERROR_INTERNAL_DUPMATCH + cdef int PCRE2_ERROR_DFA_UINVALID_UTF + + # Request types for pcre2_pattern_info(). 
+ cdef int PCRE2_INFO_ALLOPTIONS + cdef int PCRE2_INFO_ARGOPTIONS + cdef int PCRE2_INFO_BACKREFMAX + cdef int PCRE2_INFO_BSR + cdef int PCRE2_INFO_CAPTURECOUNT + cdef int PCRE2_INFO_FIRSTCODEUNIT + cdef int PCRE2_INFO_FIRSTCODETYPE + cdef int PCRE2_INFO_FIRSTBITMAP + cdef int PCRE2_INFO_HASCRORLF + cdef int PCRE2_INFO_JCHANGED + cdef int PCRE2_INFO_JITSIZE + cdef int PCRE2_INFO_LASTCODEUNIT + cdef int PCRE2_INFO_LASTCODETYPE + cdef int PCRE2_INFO_MATCHEMPTY + cdef int PCRE2_INFO_MATCHLIMIT + cdef int PCRE2_INFO_MAXLOOKBEHIND + cdef int PCRE2_INFO_MINLENGTH + cdef int PCRE2_INFO_NAMECOUNT + cdef int PCRE2_INFO_NAMEENTRYSIZE + cdef int PCRE2_INFO_NAMETABLE + cdef int PCRE2_INFO_NEWLINE + cdef int PCRE2_INFO_DEPTHLIMIT + cdef int PCRE2_INFO_RECURSIONLIMIT # Obsolete synonym + cdef int PCRE2_INFO_SIZE + cdef int PCRE2_INFO_HASBACKSLASHC + cdef int PCRE2_INFO_FRAMESIZE + cdef int PCRE2_INFO_HEAPLIMIT + cdef int PCRE2_INFO_EXTRAOPTIONS + + # Request types for pcre2_config(). + cdef int PCRE2_CONFIG_BSR + cdef int PCRE2_CONFIG_JIT + cdef int PCRE2_CONFIG_JITTARGET + cdef int PCRE2_CONFIG_LINKSIZE + cdef int PCRE2_CONFIG_MATCHLIMIT + cdef int PCRE2_CONFIG_NEWLINE + cdef int PCRE2_CONFIG_PARENSLIMIT + cdef int PCRE2_CONFIG_DEPTHLIMIT + cdef int PCRE2_CONFIG_RECURSIONLIMIT # Obsolete synonym + cdef int PCRE2_CONFIG_STACKRECURSE # Obsolete + cdef int PCRE2_CONFIG_UNICODE + cdef int PCRE2_CONFIG_UNICODE_VERSION + cdef int PCRE2_CONFIG_VERSION + cdef int PCRE2_CONFIG_HEAPLIMIT + cdef int PCRE2_CONFIG_NEVER_BACKSLASH_C + cdef int PCRE2_CONFIG_COMPILED_WIDTHS + cdef int PCRE2_CONFIG_TABLES_LENGTH + + + # Opaque handles for PCRE2 defined structs. 
+ ctypedef struct pcre2_code_t "pcre2_code": + pass + ctypedef struct pcre2_match_data_t "pcre2_match_data": + pass + ctypedef struct pcre2_general_context_t "pcre2_general_context": + pass + ctypedef struct pcre2_compile_context_t "pcre2_compile_context": + pass + ctypedef struct pcre2_match_context_t "pcre2_match_context": + pass + + # Basic string definition. Note that this assumes PCRE2 is compiled to + # support 8-bit strings. + ctypedef const uint8_t *pcre2_sptr_t "PCRE2_SPTR" + + + # Error handling functions. + int pcre2_get_error_message( + int errorcode, + uint8_t *buffer, + size_t bufflen + ) + + # Pattern compilation functions. + pcre2_code_t * pcre2_compile( + pcre2_sptr_t pattern, + size_t length, + uint32_t options, + int *errorcode, + size_t *erroroffset, + pcre2_compile_context_t *ccontext + ) + + int pcre2_jit_compile( + pcre2_code_t *code, + uint32_t options + ) + + + void pcre2_code_free(pcre2_code_t *code) + + # Information on compiled pattern. + int pcre2_pattern_info( + const pcre2_code_t *code, + uint32_t what, + void *where + ) + + int pcre2_substring_number_from_name( + const pcre2_code_t *code, + pcre2_sptr_t name + ) + + # Matching and match data functions.
+ pcre2_match_data_t * pcre2_match_data_create( + uint32_t ovecsize, + pcre2_general_context_t *gcontext + ) + + pcre2_match_data_t * pcre2_match_data_create_from_pattern( + const pcre2_code_t *code, + pcre2_general_context_t *gcontext + ) + + int pcre2_match( + const pcre2_code_t *code, + pcre2_sptr_t subject, + size_t length, + size_t startoffset, + uint32_t options, + pcre2_match_data_t *match_data, + pcre2_match_context_t *mcontext + ) + int pcre2_jit_match( + const pcre2_code_t *code, + pcre2_sptr_t subject, + size_t length, + size_t startoffset, + uint32_t options, + pcre2_match_data_t *match_data, + pcre2_match_context_t *mcontext + ) + + void pcre2_match_data_free(pcre2_match_data_t *match_data) + + uint32_t pcre2_get_ovector_count(pcre2_match_data_t *match_data) + + size_t *pcre2_get_ovector_pointer(pcre2_match_data_t *match_data) + + int pcre2_substring_nametable_scan( + const pcre2_code_t *code, + pcre2_sptr_t name, + pcre2_sptr_t *first, + pcre2_sptr_t *last + ) + + # String extraction from match data blocks. + int pcre2_substring_length_byname( + pcre2_match_data_t *match_data, + pcre2_sptr_t name, + size_t *bufflen + ) + + int pcre2_substring_get_byname( + pcre2_match_data_t *match_data, + pcre2_sptr_t name, + uint8_t **bufferptr, + size_t *bufflen + ) + + int pcre2_substring_length_bynumber( + pcre2_match_data_t *match_data, + uint32_t number, + size_t *bufflen + ) + + int pcre2_substring_get_bynumber( + pcre2_match_data_t *match_data, + uint32_t number, + uint8_t **bufferptr, + size_t *bufflen + ) + + # Substitution. + int pcre2_substitute( + const pcre2_code_t *code, + pcre2_sptr_t subject, + size_t length, + size_t startoffset, + uint32_t options, + pcre2_match_data_t *match_data, + pcre2_match_context_t *mcontext, + pcre2_sptr_t replacement, + size_t rlength, + uint8_t *outputbuffer, + size_t *outlengthptr + ) + + # Serialization. 
+    int32_t pcre2_serialize_decode(
+        pcre2_code_t **codes,
+        int32_t number_of_codes,
+        const uint8_t *code_bytes,
+        pcre2_general_context_t *gcontex
+    )
+    int32_t pcre2_serialize_encode(
+        pcre2_code_t **codes,
+        int32_t number_of_codes,
+        uint8_t **serialized_bytes,
+        size_t *serialized_size,
+        pcre2_general_context_t *gcontex
+    )
+    void pcre2_serialize_free(uint8_t *bytes)
diff --git a/src/pcre2/match.pxd b/src/pcre2/match.pxd
new file mode 100644
index 0000000..76a61e2
--- /dev/null
+++ b/src/pcre2/match.pxd
@@ -0,0 +1,22 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+cdef class Match:
+    cdef pcre2_match_data_t *_mtch
+    cdef Pattern _pattern
+    cdef Py_buffer *_subj
+    cdef size_t _ofst # Byte offset, regardless of subject type.
+    cdef uint32_t _opts
+
+    @staticmethod
+    cdef Match _from_data(
+        pcre2_match_data_t *mtch, Pattern pattern, Py_buffer *subj, size_t ofst, uint32_t opts
+    )
diff --git a/src/pcre2/match.pyx b/src/pcre2/match.pyx
new file mode 100644
index 0000000..925a534
--- /dev/null
+++ b/src/pcre2/match.pyx
@@ -0,0 +1,306 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from enum import IntEnum
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython.unicode cimport PyUnicode_Check
+cimport cython
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+cdef extern from "pcre2.h":
+    # Releases the memory returned by the pcre2_substring_get_*() functions.
+    void pcre2_substring_free(uint8_t *buffer)
+
+
+cdef int _group_number(Match match, object group) except? -1:
+    """ Resolve a group reference to a capture group number. Integers are
+    returned unchanged (and unvalidated); any other object is treated as a
+    group name and looked up in the name table of the match's pattern.
+    """
+    cdef int scan_rc
+    cdef Py_buffer *grp_name
+    cdef pcre2_sptr_t first_entry = NULL
+    cdef pcre2_sptr_t last_entry = NULL
+    if isinstance(group, int):
+        return group
+
+    grp_name = get_buffer(group)
+    scan_rc = pcre2_substring_nametable_scan(
+        match._pattern._code, <pcre2_sptr_t>grp_name.buf, &first_entry, &last_entry
+    )
+    # The name is only needed for the lookup; release it before raising so
+    # that the buffer cannot leak.
+    free_buffer(grp_name)
+    if scan_rc < 0:
+        raise_from_rc(scan_rc, None)
+
+    # The first two bytes of a name table entry hold the group number.
+    return (first_entry[0] << 8) | first_entry[1]
+
+
+@cython.freelist(8)
+cdef class Match:
+    """
+    Object wrapper for a match block in PCRE2. Contains all relevant
+    information of a successful match. Attributes defined in match.pxd, see
+    below for an overview:
+        _mtch: Raw match data block, managed by PCRE2
+        _pattern: Pattern object used in match
+        _subj: Subject the pattern was matched against
+        _ofst: Byte offset (regardless of subject type) used in match
+        _opts: Option bits used in match call
+    """
+
+    # =================================== #
+    #         Lifetime management         #
+    # =================================== #
+
+    def __cinit__(self):
+        self._mtch = NULL
+        self._pattern = None
+        self._subj = NULL
+        self._opts = 0
+
+
+    def __init__(self, *args, **kwargs):
+        # Prevent accidental instantiation from normal Python code since we
+        # cannot pass pointers into a Python constructor.
+        module = self.__class__.__module__
+        qualname = self.__class__.__qualname__
+        raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+    def __dealloc__(self):
+        if self._subj is not NULL:
+            free_buffer(self._subj)
+        if self._mtch is not NULL:
+            pcre2_match_data_free(self._mtch)
+
+
+    @staticmethod
+    cdef Match _from_data(
+        pcre2_match_data_t *mtch,
+        Pattern pattern,
+        Py_buffer *subj,
+        size_t ofst,
+        uint32_t opts):
+        """ Factory function to create Match objects from C-type fields. The
+        ownership of the given pointers are stolen, which causes the extension
+        type to free them when the object is deallocated.
+        """
+
+        # Fast call to __new__() that bypasses the __init__() constructor.
+        cdef Match match = Match.__new__(Match)
+        match._mtch = mtch
+        match._pattern = pattern
+        match._subj = subj
+        match._ofst = ofst # Code unit offset
+        match._opts = opts
+        return match
+
+
+    # ========================== #
+    #         Properties         #
+    # ========================== #
+
+    @property
+    def options(self):
+        """ Option bits used in the match call.
+        """
+        return self._opts
+
+
+    @property
+    def subject(self):
+        """ Object the pattern was matched against.
+        """
+        return self._subj.obj
+
+
+    @property
+    def pattern(self):
+        """ Pattern object used in the match.
+        """
+        return self._pattern
+
+
+    # ======================= #
+    #         Methods         #
+    # ======================= #
+
+    def start(self, group=0):
+        """ Get the starting index of the matched substring, or of a specified
+        captured group. Returns -1 for a group that did not take part in the
+        match.
+        """
+        cdef size_t *ovec_table = pcre2_get_ovector_pointer(self._mtch)
+        cdef int grp_num = _group_number(self, group)
+        cdef size_t start
+
+        # The ovector holds ovec_count pairs, so valid group numbers run from
+        # 0 to ovec_count - 1.
+        if grp_num < 0 or <uint32_t>grp_num >= pcre2_get_ovector_count(self._mtch):
+            raise ValueError("Group referenced out of bounds")
+        start = ovec_table[2 * grp_num]
+
+        # Unset groups hold PCRE2_UNSET, i.e. ~(PCRE2_SIZE)0, which is not a
+        # valid index into the subject.
+        if start == ~(<size_t>0):
+            return -1
+
+        # Convert to code point index as necessary.
+        if PyUnicode_Check(self._subj.obj):
+            _, start = codeunit_to_codepoint(self._subj, start, 0, 0)
+
+        return start
+
+
+    def end(self, group=0):
+        """ Get the ending index of the matched substring, or of a specified
+        captured group. Returns -1 for a group that did not take part in the
+        match.
+        """
+        cdef size_t *ovec_table = pcre2_get_ovector_pointer(self._mtch)
+        cdef int grp_num = _group_number(self, group)
+        cdef size_t end
+
+        # The ovector holds ovec_count pairs, so valid group numbers run from
+        # 0 to ovec_count - 1.
+        if grp_num < 0 or <uint32_t>grp_num >= pcre2_get_ovector_count(self._mtch):
+            raise ValueError("Group referenced out of bounds.")
+        end = ovec_table[2 * grp_num + 1]
+
+        # Unset groups hold PCRE2_UNSET, i.e. ~(PCRE2_SIZE)0, which is not a
+        # valid index into the subject.
+        if end == ~(<size_t>0):
+            return -1
+
+        # Convert to code point index as necessary.
+        if PyUnicode_Check(self._subj.obj):
+            _, end = codeunit_to_codepoint(self._subj, end, 0, 0)
+
+        return end
+
+
+    def substring(self, group=0, default=""):
+        """ Get the full matched substring, or that of a specified captured
+        group. The default is returned when the group cannot be extracted,
+        e.g. because it did not take part in the match.
+        """
+        cdef uint8_t *res = NULL
+        cdef size_t res_len = 0
+        cdef Py_buffer *grp_name
+        cdef int rc
+        if isinstance(group, int):
+            # Handle unset matches and return default if none found
+            rc = pcre2_substring_length_bynumber(self._mtch, group, NULL)
+            if rc < 0:
+                return default
+            rc = pcre2_substring_get_bynumber(self._mtch, group, &res, &res_len)
+        else:
+            grp_name = get_buffer(group)
+            try:
+                # Handle unset matches and return default if none found
+                rc = pcre2_substring_length_byname(
+                    self._mtch, <pcre2_sptr_t>grp_name.buf, NULL
+                )
+                if rc < 0:
+                    return default
+                rc = pcre2_substring_get_byname(
+                    self._mtch, <pcre2_sptr_t>grp_name.buf, &res, &res_len
+                )
+            finally:
+                # Release the name on every path, including the early return.
+                free_buffer(grp_name)
+        if rc < 0:
+            raise_from_rc(rc, None)
+
+        # Copy the substring out of PCRE2's buffer and release the buffer. The
+        # length excludes the terminator, so the bytes are taken as they are.
+        try:
+            result = (<char *>res)[:res_len]
+        finally:
+            pcre2_substring_free(res)
+        if PyUnicode_Check(self._subj.obj):
+            result = result.decode("utf-8")
+
+        return result
+
+
+    def __getitem__(self, group):
+        """ Alias to substring.
+        """
+        return self.substring(group)
+
+
+    def expand(self, replacement, offset=0, options=0, low_memory=False):
+        """ Equivalent to calling substitute with the provided match. The type
+        of the subject determines the type of the returned string.
+        """
+        cdef Py_buffer *repl
+        cdef Py_buffer *subj
+        cdef uint8_t *res
+        cdef size_t res_len
+        cdef size_t obj_ofst
+        cdef size_t ofst
+        cdef uint32_t opts
+        cdef size_t res_buf_len = 0
+        cdef int rc = 0
+
+        is_subj_utf = PyUnicode_Check(self._subj.obj)
+        is_repl_utf = PyUnicode_Check(replacement)
+        if is_subj_utf ^ is_repl_utf:
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            repl_type = "string" if is_repl_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {subj_type} subject with a {repl_type} replacement")
+
+        # Convert the Python arguments before any buffer is acquired, so that
+        # a failed conversion cannot leak one.
+        obj_ofst = offset
+        ofst = obj_ofst
+        opts = options | PCRE2_SUBSTITUTE_MATCHED
+        if is_subj_utf:
+            ofst, obj_ofst = codepoint_to_codeunit(self._subj, obj_ofst, 0, 0)
+        if not low_memory:
+            res_buf_len = self._subj.len + (self._subj.len // 2)
+
+        # Pattern._substitute() releases the buffers it is given when the
+        # substitution fails. Hand it a private view of the subject so that
+        # self._subj stays valid and is released exactly once, in __dealloc__.
+        repl = get_buffer(replacement)
+        try:
+            subj = get_buffer(self._subj.obj)
+        except:
+            free_buffer(repl)
+            raise
+
+        res, res_len = Pattern._substitute(
+            self._pattern._code, repl, subj, res_buf_len, ofst, opts, self._mtch, &rc
+        )
+        if res is NULL:
+            raise_from_rc(rc, None)
+
+        # Convert to unicode as appropriate. The reported length excludes the
+        # terminator, so the bytes are taken as they are.
+        try:
+            result = (<char *>res)[:res_len]
+            if is_subj_utf:
+                result = result.decode("utf-8")
+        finally:
+            free(res)
+            free_buffer(subj)
+            free_buffer(repl)
+        return result
+
+    def groups(self, default=""):
+        """ Return a tuple containing all the subgroups of the match, from 1 up
+        to however many groups are in the pattern.
+        """
+        group_count = self.pattern.capture_count
+        return tuple(self.substring(g, default=default) for g in range(1, group_count + 1))
diff --git a/src/pcre2/methods.pxd b/src/pcre2/methods.pxd
new file mode 100644
index 0000000..e69de29
diff --git a/src/pcre2/methods.pyx b/src/pcre2/methods.pyx
new file mode 100644
index 0000000..2dab35e
--- /dev/null
+++ b/src/pcre2/methods.pyx
@@ -0,0 +1,94 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from cpython cimport Py_buffer
+from cpython.unicode cimport PyUnicode_Check
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+from .match cimport Match
+
+
+def compile(pattern, options=0, jit=False):
+    """ Factory function to compile regular expressions into Pattern objects.
+    See the following PCRE2 documentation for a brief overview of the relevant
+    options:
+        http://pcre.org/current/doc/html/pcre2_compile.html
+    """
+
+    cdef uint32_t opts = options  # Converted first: a failure here must not leak patn.
+    cdef Py_buffer *patn = get_buffer(pattern)
+
+    # Ensure unicode strings are processed with UTF-8 support.
+    if PyUnicode_Check(pattern):
+        opts = opts | PCRE2_UTF
+
+    cdef int compile_rc
+    cdef size_t compile_errpos
+    cdef pcre2_code_t *code = pcre2_compile(
+        <pcre2_sptr_t>patn.buf, <size_t>patn.len, opts, &compile_rc, &compile_errpos, NULL
+    )
+
+    if code is NULL:
+        # If source was a unicode string, use the code point offset.
+        if PyUnicode_Check(pattern):
+            _, compile_errpos = codeunit_to_codepoint(patn, compile_errpos, 0, 0)
+        free_buffer(patn)  # No Pattern takes ownership of the buffer on this path.
+        raise_from_rc(compile_rc, f"Compilation failed at position {compile_errpos!r}")
+
+    pattern_obj = Pattern._from_data(code, patn, opts)
+    if jit:
+        pattern_obj.jit_compile()
+    return pattern_obj
+
+
+def findall(pattern, subject, offset=0, options=0, jit=True):
+    """ Shorthand for compiling a pattern, then calling findall. Note that this
+    uses JIT compilation unless jit is false.
+    """
+    return compile(pattern, options=options, jit=jit).findall(subject, offset=offset)
+
+
+def match(pattern, subject, offset=0, options=0, jit=False):
+    """ Shorthand for compiling a pattern, then calling match.
+    """
+    return compile(pattern, options=options, jit=jit).match(subject, offset=offset)
+
+
+def scan(pattern, subject, offset=0, options=0, jit=True):
+    """ Shorthand for compiling a pattern, then calling scan. Note that this
+    uses JIT compilation unless jit is false.
+    """
+    return compile(pattern, options=options, jit=jit).scan(subject, offset=offset)
+
+
+def split(pattern, subject, maxsplit=0, offset=0, options=0, jit=True):
+    """ Shorthand for compiling a pattern, then calling split. Note that this
+    uses JIT compilation unless jit is false.
+    """
+    pattern_obj = compile(pattern, options=options, jit=jit)
+    return pattern_obj.split(subject, maxsplit=maxsplit, offset=offset)
+
+
+def substitute(
+    pattern,
+    replacement,
+    subject,
+    offset=0,
+    suball=True,
+    literal=False,
+    low_memory=False,
+    options=0,
+    jit=True
+):
+    """ Shorthand for compiling a pattern, then calling substitute.
+    """
+    pattern_obj = compile(pattern, options=options, jit=jit)
+    # JIT compilation is left entirely to compile(), which honours the `jit`
+    # argument; forcing it here for suball would override an explicit jit=False.
+    return pattern_obj.substitute(
+        replacement, subject, offset=offset, suball=suball, literal=literal, low_memory=low_memory
+    )
diff --git a/src/pcre2/pattern.pxd b/src/pcre2/pattern.pxd
new file mode 100644
index 0000000..258e138
--- /dev/null
+++ b/src/pcre2/pattern.pxd
@@ -0,0 +1,38 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+
+
+cdef class Pattern:
+    cdef pcre2_code_t *_code
+    cdef Py_buffer *_patn
+    cdef uint32_t _opts
+    cdef bint _jitc
+
+    @staticmethod
+    cdef Pattern _from_data(
+        pcre2_code_t *code, Py_buffer *patn, uint32_t opts
+    )
+
+    @staticmethod
+    cdef uint32_t _info_uint(pcre2_code_t *code, uint32_t what) except *
+    @staticmethod
+    cdef size_t _info_size(pcre2_code_t *code, uint32_t what) except *
+    @staticmethod
+    cdef bint _info_bint(pcre2_code_t *code, uint32_t what) except *
+
+    @staticmethod
+    cdef pcre2_match_data_t * _match(
+        pcre2_code_t *code, Py_buffer *subj, size_t ofst, uint32_t opts, int *rc
+    )
+
+    @staticmethod
+    cdef (uint8_t *, size_t) _substitute(
+        pcre2_code_t *code, Py_buffer *repl, Py_buffer *subj, size_t res_buf_len,
+        size_t ofst, uint32_t opts, pcre2_match_data_t *mtch, int *rc
+    )
diff --git a/src/pcre2/pattern.pyx b/src/pcre2/pattern.pyx
new file mode 100644
index 0000000..66ce24b
--- /dev/null
+++ b/src/pcre2/pattern.pyx
@@ -0,0 +1,534 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython cimport Py_buffer
+from cpython cimport array
+from cpython.unicode cimport PyUnicode_Check
+from cpython.memoryview cimport PyMemoryView_FromMemory
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .match cimport Match
+from .scanner cimport Scanner
+from .consts import BsrChar, NewlineChar
+
+
+def _rebuild(pattern, code_bytes_obj, options):
+    """ Deserializes code object to allow for unpickling.
+    """
+    cdef pcre2_code_t *code = NULL
+    cdef uint32_t opts = options
+    cdef Py_buffer *patn
+    cdef Py_buffer *code_buf = get_buffer(code_bytes_obj)
+    cdef int number_of_codes = pcre2_serialize_decode(
+        &code, 1, <const uint8_t *>code_buf.buf, NULL
+    )
+    # The serialized bytes are only needed while decoding.
+    free_buffer(code_buf)
+    if number_of_codes < 0:
+        raise_from_rc(number_of_codes, None)
+
+    # From here on the decoded code must be freed if the Pattern that takes
+    # ownership of it cannot be created.
+    try:
+        patn = get_buffer(pattern)
+    except:
+        pcre2_code_free(code)
+        raise
+    return Pattern._from_data(code, patn, opts)
+
+
+cdef class Pattern:
+    """
+    Object wrapper for a compiled pattern (known as a code struct) in PCRE2.
+    Attributes defined in pattern.pxd, see below for an overview:
+        _code: Raw compiled pattern, managed by PCRE2
+        _patn: Python object passed to compile
+        _opts: Option bits passed to compile call
+        _jitc: Indicator if pattern was JIT compiled
+    """
+
+    # =================================== #
+    #         Lifetime management         #
+    # =================================== #
+
+    def __cinit__(self):
+        self._code = NULL
+        self._patn = NULL
+        self._opts = 0
+        self._jitc = False
+
+
+    def __init__(self, *args, **kwargs):
+        # Prevent accidental instantiation from normal Python code since we
+        # cannot pass pointers into a Python constructor.
+        module = self.__class__.__module__
+        qualname = self.__class__.__qualname__
+        raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+    def __dealloc__(self):
+        if self._patn is not NULL:
+            free_buffer(self._patn)
+        if self._code is not NULL:
+            pcre2_code_free(self._code)
+
+
+    @staticmethod
+    cdef Pattern _from_data(pcre2_code_t *code, Py_buffer *patn, uint32_t opts):
+        """ Factory function to create Pattern objects from C-type fields. The
+        ownership of the given pointers are stolen, which causes the extension
+        type to free them when the object is deallocated.
+        """
+        # Fast call to __new__() that bypasses the __init__() constructor.
+        cdef Pattern pattern = Pattern.__new__(Pattern)
+        pattern._code = code
+        pattern._patn = patn
+        pattern._opts = opts
+        return pattern
+
+
+    # ========================================= #
+    #         Serialize and deserialize         #
+    # ========================================= #
+
+    def __reduce__(self):
+        """ Serializes code object to allow for pickling.
+        """
+        cdef uint8_t *code_bytes = NULL
+        cdef size_t code_count = 0
+        serialize_rc = pcre2_serialize_encode(
+            &self._code, 1, &code_bytes, &code_count, NULL
+        )
+        if serialize_rc < 0:
+            raise_from_rc(serialize_rc, None)
+
+        # Copy the serialized data into a bytes object, then release the
+        # PCRE2-owned block it was copied from.
+        try:
+            code_obj = (<char *>code_bytes)[:code_count]
+        finally:
+            pcre2_serialize_free(code_bytes)
+        return (_rebuild, (self._patn.obj, code_obj, self._opts))
+
+
+    # =================================== #
+    #         Pattern information         #
+    # =================================== #
+
+    @staticmethod
+    cdef uint32_t _info_uint(pcre2_code_t *code, uint32_t what) except *:
+        """ Safely access pattern info returned as uint32_t.
+        """
+        cdef uint32_t where
+        pattern_info_rc = pcre2_pattern_info(code, what, &where)
+        if pattern_info_rc < 0:
+            raise_from_rc(pattern_info_rc, None)
+        return where
+
+    @staticmethod
+    cdef size_t _info_size(pcre2_code_t *code, uint32_t what) except *:
+        """ Safely access pattern info returned as size_t.
+        """
+        cdef size_t where
+        pattern_info_rc = pcre2_pattern_info(code, what, &where)
+        if pattern_info_rc < 0:
+            raise_from_rc(pattern_info_rc, None)
+        return where
+
+    @staticmethod
+    cdef bint _info_bint(pcre2_code_t *code, uint32_t what) except *:
+        """ Safely access pattern info returned as bint.
+        """
+        cdef bint where
+        pattern_info_rc = pcre2_pattern_info(code, what, &where)
+        if pattern_info_rc < 0:
+            raise_from_rc(pattern_info_rc, None)
+        return where
+
+
+    @property
+    def pattern(self):
+        """ Return the pattern the object was compiled with.
+        """
+        return self._patn.obj
+
+
+    @property
+    def options(self):
+        """ Returns the compile options as modified by any top-level (*XXX)
+        option settings such as (*UTF) at the start of the pattern itself.
+        """
+        return Pattern._info_uint(self._code, PCRE2_INFO_ALLOPTIONS)
+
+
+    @property
+    def backslash_r(self):
+        """ Return an indicator to what character sequences the \R escape
+        sequence matches.
+        """
+        bsr = Pattern._info_uint(self._code, PCRE2_INFO_BSR)
+        return BsrChar(bsr)
+
+
+    @property
+    def capture_count(self):
+        """ Returns the highest capture group number in the pattern. In
+        patterns where `(?|` is not used, this is also the total number of
+        capture groups.
+        """
+        return Pattern._info_uint(self._code, PCRE2_INFO_CAPTURECOUNT)
+
+
+    @property
+    def jit_size(self):
+        """ If the compiled pattern was successfully JIT compiled, return the
+        size of the JIT compiled code, otherwise return zero.
+        """
+        return Pattern._info_size(self._code, PCRE2_INFO_JITSIZE)
+
+    @property
+    def min_length(self):
+        """ Returns the minimum number of characters of matching subject strings.
+        """
+        return Pattern._info_uint(self._code, PCRE2_INFO_MINLENGTH)
+
+
+    @property
+    def name_count(self):
+        """ Returns the number of named capture groups.
+        """
+        return Pattern._info_uint(self._code, PCRE2_INFO_NAMECOUNT)
+
+
+    @property
+    def newline(self):
+        """ Returns the type of character sequence that will be recognized as
+        a newline while matching.
+        """
+        newline = Pattern._info_uint(self._code, PCRE2_INFO_NEWLINE)
+        return NewlineChar(newline)
+
+
+    @property
+    def size(self):
+        """ Returns the size of the compiled pattern in bytes.
+        """
+        return Pattern._info_size(self._code, PCRE2_INFO_SIZE)
+
+
+    def name_dict(self):
+        """ Returns a mapping from capture group number to capture group name.
+        """
+        # Get name table related information.
+        name_count = Pattern._info_uint(self._code, PCRE2_INFO_NAMECOUNT)
+        name_entry_size = Pattern._info_uint(self._code, PCRE2_INFO_NAMEENTRYSIZE)
+
+        cdef pcre2_sptr_t name_table
+        pattern_info_rc = pcre2_pattern_info(self._code, PCRE2_INFO_NAMETABLE, &name_table)
+        if pattern_info_rc < 0:
+            raise_from_rc(pattern_info_rc, None)
+
+        # Convert byte table to dictionary.
+        name_dict = {}
+        cdef uint32_t i
+        for i in range(name_count):
+            offset = i * name_entry_size
+
+            # First two bytes of name table contain index, followed by possibly
+            # unicode byte string.
+            entry_idx = int((name_table[offset] << 8) | name_table[offset + 1])
+            entry_name = name_table[offset + 2:offset + name_entry_size]
+
+            # Clean up entry and convert to unicode as appropriate.
+            entry_name = entry_name.strip(b"\x00")
+            if PyUnicode_Check(self._patn.obj):
+                entry_name = entry_name.decode("utf-8")
+
+            name_dict[entry_idx] = entry_name
+
+        return name_dict
+
+
+    # ======================= #
+    #         Methods         #
+    # ======================= #
+
+    def jit_compile(self):
+        """ JIT compile the pattern.
+        """
+        jit_compile_rc = pcre2_jit_compile(self._code, PCRE2_JIT_COMPLETE)
+        if jit_compile_rc < 0:
+            raise_from_rc(jit_compile_rc, None)
+        self._jitc = True
+
+
+    @staticmethod
+    cdef pcre2_match_data_t * _match(
+        pcre2_code_t *code,
+        Py_buffer *subj,
+        size_t ofst,
+        uint32_t opts,
+        int *rc):
+        """ Safe wrapper around calling PCRE2 function directly.
+        """
+        # Allocate memory for match.
+        mtch = pcre2_match_data_create_from_pattern(code, NULL)
+        if mtch is NULL:
+            rc[0] = PCRE2_ERROR_NOMEMORY
+            return NULL
+
+        # Attempt match of pattern onto subject.
+        rc[0] = pcre2_match(
+            code, <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+            ofst, opts, mtch, NULL
+        )
+        return mtch
+
+
+    def findall(self, subject, offset=0):
+        """
+        Return all non-overlapping matches of our pattern in subject, as a list of strings or tuples.
+
+        The string is scanned left-to-right, and matches are returned in the
+        order found. Empty matches are included in the result.
+
+        The result depends on the number of capturing groups in the pattern.
+        If there are no groups, return a list of strings matching the whole
+        pattern. If there is exactly one group, return a list of strings
+        matching that group. If multiple groups are present, return a list of
+        tuples of strings matching the groups. Non-capturing groups do not
+        affect the form of the result.
+        """
+        matches = self.scan(subject, offset=offset)
+        group_count = self.capture_count
+        if group_count == 0:
+            return [m.substring() for m in matches]
+        elif group_count == 1:
+            return [m.substring(1) for m in matches]
+        # Capture groups are numbered from 1; group 0 is the whole match.
+        result = []
+        for m in matches:
+            result.append(tuple(m.substring(g) for g in range(1, group_count + 1)))
+        return result
+
+
+    def match(self, subject, offset=0):
+        """ If match exists, returns the corresponding Match object. Otherwise
+        a MatchError is thrown in the case of no matches. See the following
+        PCRE2 documentation for a brief overview of the relevant options:
+            http://pcre.org/current/doc/html/pcre2_match.html
+        """
+        cdef bint is_patn_utf = PyUnicode_Check(self._patn.obj)
+        cdef bint is_subj_utf = PyUnicode_Check(subject)
+        if is_patn_utf ^ is_subj_utf:
+            patn_type = "string" if is_patn_utf else "bytes-like"
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+        # Convert the offset before the buffer is acquired so that a failing
+        # conversion cannot leak it.
+        cdef size_t obj_ofst = offset
+        cdef size_t ofst = obj_ofst
+        cdef uint32_t opts = 0
+        cdef Py_buffer *subj = get_buffer(subject)
+
+        # Convert indices accordingly.
+        if is_subj_utf:
+            ofst, obj_ofst = codepoint_to_codeunit(subj, obj_ofst, 0, 0)
+
+        cdef int match_rc = 0
+        mtch = Pattern._match(self._code, subj, ofst, opts, &match_rc)
+        if match_rc < 0:
+            # Neither resource has been handed to a Match yet, so release both
+            # before raising. pcre2_match_data_free() accepts NULL.
+            pcre2_match_data_free(mtch)
+            free_buffer(subj)
+            raise_from_rc(match_rc, None)
+
+        return Match._from_data(mtch, self, subj, ofst, opts)
+
+
+    def scan(self, subject, offset=0):
+        """ Returns iterator over all non-overlapping matches in a subject,
+        yielding Match objects.
+        """
+        cdef bint is_patn_utf = PyUnicode_Check(self._patn.obj)
+        cdef bint is_subj_utf = PyUnicode_Check(subject)
+        if is_patn_utf ^ is_subj_utf:
+            patn_type = "string" if is_patn_utf else "bytes-like"
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+        # Convert the offset first so that a failing conversion cannot leak subj.
+        cdef size_t ofst = offset
+        subj = get_buffer(subject)
+        return Scanner._from_data(self, subj, ofst)
+
+
+    def split(self, subject, maxsplit=0, offset=0):
+        """
+        Split subject by occurrences of our pattern.
+
+        If capturing parentheses are used in pattern, then the text of all
+        groups in the pattern are also returned as part of the resulting list.
+        If maxsplit is nonzero, at most maxsplit splits occur, and the
+        remainder of the string is returned as the final element of the list.
+
+        If there are capturing groups in the separator and it matches at the
+        start of the string, the result will start with an empty string. The
+        same holds for the end of the string.
+
+        That way, separator components are always found at the same relative
+        indices within the result list.
+
+        Empty matches for the pattern split the string only when not adjacent
+        to a previous empty match.
+        """
+        output = []
+        pos = n = 0
+        group_count = self.capture_count
+        for match in self.scan(subject, offset=offset):
+            start = match.start()
+            end = match.end()
+            if start != end:
+                output.append(subject[pos:start])
+                # Capture groups are numbered from 1; group 0 is the separator itself.
+                output.extend(match.substring(g) for g in range(1, group_count + 1))
+                pos = end
+                n += 1
+                if 0 < maxsplit <= n:
+                    break
+        output.append(subject[pos:])
+        return output
+
+
+    @staticmethod
+    cdef (uint8_t *, size_t) _substitute(
+        pcre2_code_t *code,
+        Py_buffer *repl,
+        Py_buffer *subj,
+        size_t res_buf_len,
+        size_t ofst,
+        uint32_t opts,
+        pcre2_match_data_t *mtch,
+        int *rc):
+        """ Safe wrapper around calling PCRE2 function directly. On failure
+        the error code is stored in rc, both subj and repl are released on
+        behalf of the caller and (NULL, 0) is returned; on success the caller
+        owns the returned buffer and must free() it.
+        """
+        cdef size_t res_len = res_buf_len
+        cdef uint8_t *res = <uint8_t *>malloc(res_len * sizeof(uint8_t))
+        cdef int substitute_rc
+        if res is NULL:
+            # Without a buffer, let PCRE2 report the size that is required.
+            res_len = 0
+        substitute_rc = pcre2_substitute(
+            code,
+            <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+            ofst, opts | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, mtch, NULL,
+            <pcre2_sptr_t>repl.buf, <size_t>repl.len,
+            res, &res_len
+        )
+        # Reattempt substitution, now with required size of buffer known.
+        if substitute_rc == PCRE2_ERROR_NOMEMORY:
+            free(res)
+            res = <uint8_t *>malloc(res_len * sizeof(uint8_t))
+            # Keep the PCRE2_ERROR_NOMEMORY code if the allocation itself failed.
+            if res is not NULL:
+                substitute_rc = pcre2_substitute(
+                    code,
+                    <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+                    ofst, opts, mtch, NULL,
+                    <pcre2_sptr_t>repl.buf, <size_t>repl.len,
+                    res, &res_len
+                )
+        # Capture return codes from both substitute attempts.
+        if substitute_rc < 0:
+            free(res)
+            free_buffer(subj)
+            free_buffer(repl)
+            rc[0] = substitute_rc
+            return NULL, 0
+
+        return res, res_len
+
+
+    def substitute(
+        self, replacement, subject, offset=0, suball=True, literal=False, low_memory=False
+    ):
+        """ Returns the string obtained by replacing matches in subject with a
+        replacement. Note that option bits can significantly change the
+        function's behavior. See the following PCRE2 documentation for a brief
+        overview of the relevant options:
+            http://pcre.org/current/doc/html/pcre2_substitute.html
+        """
+        cdef Py_buffer *subj
+        cdef Py_buffer *repl
+        cdef uint8_t *res
+        cdef size_t res_len
+        cdef size_t obj_ofst
+        cdef size_t ofst
+        cdef uint32_t opts = 0
+        cdef size_t res_buf_len = 0
+        cdef int rc = 0
+
+        is_patn_utf = PyUnicode_Check(self._patn.obj)
+        is_subj_utf = PyUnicode_Check(subject)
+        is_repl_utf = PyUnicode_Check(replacement)
+        if is_subj_utf ^ is_repl_utf:
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            repl_type = "string" if is_repl_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {subj_type} subject with a {repl_type} replacement")
+        if is_patn_utf ^ is_subj_utf:
+            patn_type = "string" if is_patn_utf else "bytes-like"
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+        # Fill options from flags
+        if suball:
+            opts = opts | PCRE2_SUBSTITUTE_GLOBAL
+        if literal:
+            opts = opts | PCRE2_SUBSTITUTE_LITERAL
+
+        # Always replace unmatched groups with an empty string to match behavior of re
+        opts = opts | PCRE2_SUBSTITUTE_UNSET_EMPTY
+
+        # Convert Python objects to C types. The offset goes first so that a
+        # failing conversion cannot leak a buffer.
+        obj_ofst = offset
+        ofst = obj_ofst
+        subj = get_buffer(subject)
+        try:
+            repl = get_buffer(replacement)
+        except:
+            free_buffer(subj)
+            raise
+
+        if is_subj_utf:
+            ofst, obj_ofst = codepoint_to_codeunit(subj, obj_ofst, 0, 0)
+        if not low_memory:
+            res_buf_len = subj.len + (subj.len // 2)
+
+        res, res_len = Pattern._substitute(
+            self._code, repl, subj, res_buf_len, ofst, opts, NULL, &rc
+        )
+        if res is NULL:
+            # _substitute() has already released subj and repl.
+            raise_from_rc(rc, None)
+
+        # Convert to unicode as appropriate. The reported length excludes the
+        # terminator, so the bytes are taken as they are.
+        try:
+            result = (<char *>res)[:res_len]
+            if is_subj_utf:
+                result = result.decode("utf-8")
+        finally:
+            free(res)
+            free_buffer(subj)
+            free_buffer(repl)
+        return result
diff --git a/src/pcre2/scanner.pxd b/src/pcre2/scanner.pxd
new file mode 100644
index 0000000..475228f
--- /dev/null
+++ b/src/pcre2/scanner.pxd
@@ -0,0 +1,26 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+cdef class Scanner:
+    cdef Pattern _pattern
+    cdef Py_buffer *_subj
+
+    cdef bint _is_crlf_newline
+    cdef bint _is_patn_utf
+
+    cdef uint32_t _state_opts
+    cdef size_t _state_ofst
+    cdef size_t _state_obj_ofst
+
+    @staticmethod
+    cdef Scanner _from_data(
+        Pattern pattern, Py_buffer *subject, size_t offset
+    )
diff --git a/src/pcre2/scanner.pyx b/src/pcre2/scanner.pyx
new file mode 100644
index 0000000..b6da2b1
--- /dev/null
+++ b/src/pcre2/scanner.pyx
@@ -0,0 +1,173 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython cimport Py_buffer
+from cpython cimport array
+from cpython.unicode cimport PyUnicode_Check
+from cpython.memoryview cimport PyMemoryView_FromMemory
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .match cimport Match
+from .pattern cimport Pattern
+from .consts import BsrChar, NewlineChar
+
+
+cdef class Scanner:
+    """ Iterator object that scans a subject for all non-overlapping matches of
+    a pattern. Attributes defined in scanner.pxd, see below for an overview:
+        _pattern: Pattern object to use for matching
+        _subj: Subject to scan
+        _is_crlf_newline: Whether the character sequence CRLF denotes a newline
+        _is_patn_utf: Whether the pattern was compiled with UTF support
+        _state_opts: Options to pass to match
+        _state_ofst: Byte offset to match at
+        _state_obj_ofst: Object offset to match at
+    """
+
+
+    # =================================== #
+    #         Lifetime management         #
+    # =================================== #
+
+    def __cinit__(self):
+        self._pattern = None
+        self._subj = NULL
+
+        self._is_patn_utf = False
+        self._is_crlf_newline = False
+
+        self._state_opts = 0
+        self._state_ofst = 0
+        self._state_obj_ofst = 0
+
+
+    def __init__(self, *args, **kwargs):
+        # Prevent accidental instantiation from normal Python code since we
+        # cannot pass pointers into a Python constructor.
+        module = self.__class__.__module__
+        qualname = self.__class__.__qualname__
+        raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+    def __dealloc__(self):
+        if self._subj is not NULL:
+            free_buffer(self._subj)
+
+
+    @staticmethod
+    cdef Scanner _from_data(Pattern pattern, Py_buffer *subj, size_t offset):
+        """ Factory function to create Scanner objects from C-type fields. The
+        ownership of the given pointers are stolen, which causes the extension
+        type to free them when the object is deallocated.
+        """
+        # Fast call to __new__() that bypasses the __init__() constructor.
+        cdef Scanner scanner = Scanner.__new__(Scanner)
+        scanner._pattern = pattern
+        scanner._subj = subj
+
+        patn_opts = Pattern._info_uint(pattern._code, PCRE2_INFO_ALLOPTIONS)
+        scanner._is_patn_utf = (patn_opts & PCRE2_UTF) != 0
+        newline = Pattern._info_uint(pattern._code, PCRE2_INFO_NEWLINE)
+        scanner._is_crlf_newline = (
+            newline == PCRE2_NEWLINE_ANY or
+            newline == PCRE2_NEWLINE_CRLF or
+            newline == PCRE2_NEWLINE_ANYCRLF
+        )
+        scanner._state_opts = 0
+
+        # Compute and set byte equivalent offset.
+        if scanner._is_patn_utf:
+            ofst, obj_ofst = codepoint_to_codeunit(scanner._subj, offset, 0, 0)
+            scanner._state_ofst = ofst
+            scanner._state_obj_ofst = obj_ofst
+        else:
+            scanner._state_obj_ofst = offset
+            scanner._state_ofst = scanner._state_obj_ofst
+        return scanner
+
+
+    # ======================================== #
+    #         Iteration implementation         #
+    # ======================================== #
+
+    def __iter__(self):
+        return self
+
+
+    def __next__(self):
+        """ Yields next match object found in subject.
+        """
+        if self._state_obj_ofst > self._subj.len:
+            raise StopIteration
+
+        # Attempt match of pattern onto subject.
+        cdef int match_rc = 0
+        mtch = Pattern._match(
+            self._pattern._code, self._subj, self._state_ofst, self._state_opts, &match_rc
+        )
+
+        # Handle no matches in result.
+        if match_rc == PCRE2_ERROR_NOMATCH:
+            # Default match is not anchored so if no match found at current offset, then there
+            # will not be any ahead either. Nor is there after an empty match at the very end.
+            if self._state_opts == 0 or self._state_ofst >= <size_t>self._subj.len:
+                pcre2_match_data_free(mtch)
+                raise StopIteration
+
+            # Reset options so empty strings can match at next offset.
+            self._state_opts = 0
+
+            # Increment to next character and handle possible CRLF newlines.
+            obj_ofst_increment = 1
+            if self._is_crlf_newline and (self._state_ofst + 1) < self._subj.len:
+                if (<char *>self._subj.buf)[self._state_ofst:self._state_ofst + 2] == b"\r\n":
+                    obj_ofst_increment += 1
+
+            # Convert indices accordingly.
+            if self._is_patn_utf:
+                self._state_ofst, self._state_obj_ofst = codepoint_to_codeunit(
+                    self._subj,
+                    self._state_obj_ofst + obj_ofst_increment,
+                    self._state_ofst,
+                    self._state_obj_ofst
+                )
+            else:
+                self._state_obj_ofst = self._state_obj_ofst + obj_ofst_increment
+                self._state_ofst = self._state_obj_ofst
+
+            pcre2_match_data_free(mtch)
+            return self.__next__()
+
+        # Handle all other errors.
+        elif mtch is NULL or match_rc < 0:
+            pcre2_match_data_free(mtch)
+            raise_from_rc(match_rc, None)
+
+        # If the match was successful.
+        else:
+            ovec_table = pcre2_get_ovector_pointer(mtch)
+            mtch_end = ovec_table[1]
+
+            # An empty match, wherever it was found, must be followed by a non-empty anchored try.
+            if ovec_table[0] == mtch_end:
+                self._state_opts = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED
+            else:
+                self._state_opts = 0
+            # Convert the end in the byte string to the end in the object.
+            if self._is_patn_utf:
+                self._state_ofst, self._state_obj_ofst = codeunit_to_codepoint(
+                    self._subj, mtch_end, self._state_ofst, self._state_obj_ofst
+                )
+            else:
+                self._state_obj_ofst = mtch_end
+                self._state_ofst = self._state_obj_ofst
+
+            # Create new buffer for match object to own
+            subj_copy = get_buffer(self._subj.obj)
+            return Match._from_data(
+                mtch, self._pattern, subj_copy, self._state_ofst, self._state_opts
+            )
diff --git a/src/pcre2/utils.pxd b/src/pcre2/utils.pxd
new file mode 100755
index 0000000..1d4d898
--- /dev/null
+++ b/src/pcre2/utils.pxd
@@ -0,0 +1,22 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+
+
+cdef int free_buffer(Py_buffer *pybuf)
+
+cdef Py_buffer * get_buffer(object obj) except NULL
+
+cdef (size_t, size_t) codeunit_to_codepoint(
+    Py_buffer *pybuf,
+    size_t codeunit_idx,
+    size_t cur_codeunit_idx, size_t cur_codepoint_idx
+)
+cdef (size_t, size_t) codepoint_to_codeunit(
+    Py_buffer *pybuf,
+    size_t codepoint_idx,
+    size_t cur_codeunit_idx, size_t cur_codepoint_idx
+)
+
+cdef void * raise_from_rc(int errorcode, object context_msg) except NULL
diff --git a/src/pcre2/utils.pyx b/src/pcre2/utils.pyx
new file mode 100755
index 0000000..1d7dde3
--- /dev/null
+++ b/src/pcre2/utils.pyx
@@ -0,0 +1,133 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdlib cimport malloc, free
+from libc.stdint cimport uint8_t
+from cpython cimport Py_buffer
+from cpython.buffer cimport (
+    PyObject_CheckBuffer,
+    PyBuffer_IsContiguous,
+    PyObject_GetBuffer,
+    PyBuffer_FillInfo,
+    PyBuffer_Release
+)
+from cpython.unicode cimport (
+    PyUnicode_Check
+)
+cdef extern from "Python.h":
+    int PyUnicode_1BYTE_KIND
+    int PyUnicode_2BYTE_KIND
+    int PyUnicode_4BYTE_KIND
+    unsigned int PyUnicode_KIND(object o)
+    void *PyUnicode_DATA(object o)
+    const char * PyUnicode_AsUTF8AndSize(object unicode, Py_ssize_t *size) except NULL
+
+# Local imports.
+from .libpcre2 cimport *
+from .exceptions import LibraryError, CompileError, MatchError
+
+
+cdef int free_buffer(Py_buffer *pybuf):
+    """ Safely free a buffer pointer, ensuring it first releases its reference
+    """
+    if pybuf is not NULL:
+        PyBuffer_Release(pybuf)
+        free(pybuf)
+    return 0
+
+
+cdef Py_buffer * get_buffer(object obj) except NULL:
+    """ Get a Python buffer from an object, encoding via UTF-8 if unicode
+    based
+
+    The buffer is heap allocated; the caller owns it and must release it with
+    free_buffer(). The allocation is freed again if acquiring the view fails.
+    """
+    cdef const char *sptr = NULL
+    cdef Py_ssize_t length = 0
+    cdef Py_buffer *pybuf = <Py_buffer *>malloc(sizeof(Py_buffer))
+
+    if not pybuf:
+        raise MemoryError()
+
+    # Process unicode and derivative objects.
+    if PyUnicode_Check(obj):
+        try:
+            sptr = PyUnicode_AsUTF8AndSize(obj, &length)
+            if PyBuffer_FillInfo(pybuf, obj, <void *>sptr, length, 1, 0) < 0:
+                raise ValueError("Could not fill internal buffer")
+        except:
+            # No view was acquired, so there is nothing to release; only undo the allocation.
+            free(pybuf)
+            raise
+
+    # Handle all other bytes-like objects.
+    elif PyObject_CheckBuffer(obj):
+        try:
+            PyObject_GetBuffer(obj, pybuf, 0)
+        except:
+            # No view was acquired, so there is nothing to release; only undo the allocation.
+            free(pybuf)
+            raise
+        if not PyBuffer_IsContiguous(pybuf, b"A"):
+            free_buffer(pybuf)
+            raise ValueError("Bytes-like object must be contiguous")
+
+    else:
+        free(pybuf)
+        raise ValueError("Input must be string or bytes-like")
+
+    return pybuf
+
+
+cdef (size_t, size_t) codeunit_to_codepoint(
+    Py_buffer *pybuf,
+    size_t codeunit_idx,
+    size_t cur_codeunit_idx, size_t cur_codepoint_idx
+):
+    """ Convert a code unit index to a code point index, resuming from the current indices
+    """
+    while cur_codeunit_idx < codeunit_idx:
+        if (((<uint8_t *>pybuf.buf)[cur_codeunit_idx]) & 0xC0) != 0x80:
+            cur_codepoint_idx += 1
+        cur_codeunit_idx += 1
+    return cur_codeunit_idx, cur_codepoint_idx
+
+
+cdef (size_t, size_t) codepoint_to_codeunit(
+    Py_buffer *pybuf,
+    size_t codepoint_idx,
+    size_t cur_codeunit_idx, size_t cur_codepoint_idx
+):
+    """ Convert a code point index to a code unit index, resuming from the current indices
+
+    Positions at or past the end of the buffer are never read; each of them
+    counts as one code point, so requests beyond the end still terminate.
+    """
+    cdef size_t buf_len = <size_t>pybuf.len
+    while cur_codepoint_idx < codepoint_idx:
+        cur_codeunit_idx += 1
+        if (
+            cur_codeunit_idx >= buf_len or
+            (((<uint8_t *>pybuf.buf)[cur_codeunit_idx]) & 0xC0) != 0x80
+        ):
+            cur_codepoint_idx += 1
+    return cur_codeunit_idx, cur_codepoint_idx
+
+
+cdef void * raise_from_rc(int errorcode, object context_msg) except NULL:
+    """ Raise the appropriate error type from the given error code
+
+    Raises one of the custom exception classes imported from the exceptions
+    module: CompileError for positive codes, MatchError for no-match and
+    partial-match codes, and LibraryError for every other code.
+    """
+    # Match against error code classes.
+    if errorcode > 0:
+        raise CompileError(errorcode, context_msg)
+
+    elif errorcode == PCRE2_ERROR_NOMATCH or errorcode == PCRE2_ERROR_PARTIAL:
+        raise MatchError(errorcode, context_msg)
+
+    else:
+        raise LibraryError(errorcode, context_msg)
diff --git a/tests/test_groups.py b/tests/test_groups.py
new file mode 100644
index 0000000..2fcfd2e
--- /dev/null
+++ b/tests/test_groups.py
@@ -0,0 +1,15 @@
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+
+def test_match_groups():
+    """ groups() returns only the capture groups, for str, bytes and non-ASCII subjects."""
+    assert pcre2.match('a', 'a').groups() == ()
+    assert pcre2.match('(a)', 'a').groups() == ('a',)
+
+    assert pcre2.match(b'a', b'a').groups() == ()
+    assert pcre2.match(b'(a)', b'a').groups() == (b'a',)
+
+    for a in ("\xe0", "\u0430", "\U0001d49c"):
+        assert pcre2.match(a, a).groups() == ()
+        assert pcre2.match('(%s)' % a, a).groups() == (a,)
diff --git a/tests/test_match.py b/tests/test_match.py
new file mode 100644
index 0000000..8db098b
--- /dev/null
+++ b/tests/test_match.py
@@ -0,0 +1,40 @@
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+
+
+# All tests should match successfully.
+test_data_match_bounds = [
+    (b".*", "aba•ba••ba•••b".encode(), 0, 0, 0, 0, 26),
+    (".*", "aba•ba••ba•••b", 0, 0, 0, 0, 14),
+]
+@pytest.mark.parametrize("pattern,subject,options,offset,group,start,end", test_data_match_bounds)
+def test_match_bounds(pattern, subject, options, offset, group, start, end):
+    p = pcre2.compile(pattern, options=options)
+    m = p.match(subject, offset=offset)
+    assert (m.start(group), m.end(group)) == (start, end)
+
+
+test_data_match_substring = [
+    (b".*", "aba•ba••ba•••b".encode(), 0, 0, "aba•ba••ba•••b".encode()),
+    (".*", "aba•ba••ba•••b", 0, 0, "aba•ba••ba•••b"),
+]
+@pytest.mark.parametrize("pattern,subject,options,offset,substring", test_data_match_substring)
+def test_match_substring(pattern, subject, options, offset, substring):
+    p = pcre2.compile(pattern, options=options)
+    m = p.match(subject, offset=offset)
+    assert m.substring() == substring
+
+
+test_data_match_expand = [
+    (b"[abc]*", b"", b"dabacbaccbacccb", 0, 0, b"dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", 0, 0, "dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", 0, 1, "d"),
+]
+@pytest.mark.parametrize(
+    "pattern,replacement,subject,options,offset,result", test_data_match_expand
+)
+def test_match_expand(pattern, replacement, subject, options, offset, result):
+    p = pcre2.compile(pattern, options=options)
+    m = p.match(subject, offset=offset)
+    assert m.expand(replacement) == result
\ No newline at end of file
diff --git a/tests/test_pattern.py b/tests/test_pattern.py
new file mode 100644
index 0000000..d953228
--- /dev/null
+++ b/tests/test_pattern.py
@@ -0,0 +1,241 @@
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+from pcre2.consts import CompileOption
+
+
+test_data_pattern_compile_success = [
+    (b"a+b+c*d*", 0, "SUCCESS"),
+    (b"(?<foo>a+b+)c*d*", 0, "SUCCESS"),
+    (b"(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),
+    ("å+∫+ç*∂*".encode(), 0, "SUCCESS"),
+    ("a+b+c*d*", 0, "SUCCESS"),
+    ("(?<foo>a+b+)c*d*", 0, "SUCCESS"),
+    ("(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),
+    ("(?<a+b+)c*d*", 0, "COMPILE_ERROR"),
+    ("(?<foo>a+b+)c*d*(?<foo>a+b+)", 0, "COMPILE_ERROR"),
+    ("(?<foo>a+b+)c*d*(?<foo>a+b+)", pcre2.CompileOption.DUPNAMES, "SUCCESS"),
+    ("å+∫+ç*∂*", 0, "SUCCESS"),
+    ("(?<ƒøø>a+b+)c*d*", 0, "SUCCESS"),
+]
+@pytest.mark.parametrize("pattern,options,return_code", test_data_pattern_compile_success)
+def test_pattern_compile_success(pattern, options, return_code):
+    """ Compiling without JIT gives the expected outcome and a zero jit_size."""
+    try:
+        p = pcre2.compile(pattern, options=options)
+        rc = "SUCCESS"
+        assert p.jit_size == 0
+    except CompileError as e:
+        rc = "COMPILE_ERROR"
+    except LibraryError as e:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+@pytest.mark.parametrize("pattern,options,return_code", test_data_pattern_compile_success)
+def test_pattern_jit_compile_success(pattern, options, return_code):
+    """ Compiling with jit=True gives the expected outcome and a non-zero jit_size."""
+    try:
+        p = pcre2.compile(pattern, options=options, jit=True)
+        rc = "SUCCESS"
+        assert p.jit_size > 0
+    except CompileError as e:
+        rc = "COMPILE_ERROR"
+    except LibraryError as e:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+
+test_data_pattern_name_dict = [
+    (b"(?<foo>a+b+)c*d*", 0, {1: b"foo"}),
+    ("(?<foo>a+b+)c*d*", 0, {1: "foo"}),
+    ("(?<ƒøø>a+b+)c*d*", 0, {1: "ƒøø"}),
+    ("(?<foo>a+b+)c*d*(?<bar>a+b+)", 0, {1: "foo", 2: "bar"}),
+    ("(?<foo>a+b+)c*(.+)d*(?<bar>a+b+)", 0, {1: "foo", 3: "bar"}),
+    ("(?<foo>a+b+)c*d*(?<foo>a+b+)", pcre2.CompileOption.DUPNAMES, {1: "foo", 2: "foo"}),
+]
+@pytest.mark.parametrize("pattern,options,name_dict", test_data_pattern_name_dict)
+def test_pattern_name_dict(pattern, options, name_dict):
+    """ name_dict() maps group numbers to the names of the named groups."""
+    p = pcre2.compile(pattern, options=options)
+    assert p.name_dict() == name_dict
+
+
+test_data_pattern_match_success = [
+    (b".*", b"abacbaccbacccb", 0, 0, "SUCCESS"),
+    (".*", "abacbaccbacccb", 0, 0, "SUCCESS"),
+    ("ac{3,}b", "abacbaccbacccb", 0, 0, "SUCCESS"),
+    ("a•{3,}b", "aba•ba••ba•••b", 0, 0, "SUCCESS"),
+    ("ab", "abacbaccbacccb", 0, 2, "MATCH_ERROR"),
+    ("((((((((((((((()))))))))))))))", "", 0, 0, "SUCCESS"),
+]
+@pytest.mark.parametrize(
+    "pattern,subject,options,offset,return_code", test_data_pattern_match_success
+)
+def test_pattern_match_success(pattern, subject, options, offset, return_code):
+    """ match() either succeeds or raises the expected error class."""
+    p = pcre2.compile(pattern, options=options)
+    try:
+        m = p.match(subject, offset=offset)
+        rc = "SUCCESS"
+    except MatchError as e:
+        rc = "MATCH_ERROR"
+    except LibraryError as e:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+
+test_data_pattern_scan_length = [
+    (b".+", b"abacbaccbacccb", 0, 1),
+    (b".*", b"abacbaccbacccb", 0, 2),
+    (".+", "abacbaccbacccb", 0, 1),
+    (".*", "abacbaccbacccb", 0, 2),
+    ("[abc]*", "dabacbaccbacccb", 0, 3),
+    ("ac{2,}b", "abacbaccbacccb", 0, 2),
+    ("a•{2,}b", "aba•ba••ba•••b", 0, 2),
+    ("a•*b", "aba•ba••ba•••b", 0, 4),
+    ("ab", "abacbaccbacccb", 2, 0),
+]
+@pytest.mark.parametrize(
+    "pattern,subject,offset,iter_length", test_data_pattern_scan_length
+)
+def test_pattern_scan_length(pattern, subject, offset, iter_length):
+    """ scan() yields the expected number of matches."""
+    p = pcre2.compile(pattern)
+    s = p.scan(subject, offset=offset)
+    assert len(list(iter(s))) == iter_length
+
+
+test_pattern_substitute = [
+    (b"[abc]*", b"", b"dabacbaccbacccb", False, False, 0, b"dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", False, False, 0, "dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", False, False, 1, "d"),
+    ("a(•{2,})b", "a•b", "aba•ba••ba•••b", True, False, 0, "aba•ba•ba•b"),
+    ("a(•{2,})b", "a$1b", "aba•ba••ba•••b", True, True, 0, "aba•ba$1ba$1b"),
+]
+@pytest.mark.parametrize(
+    "pattern,replacement,subject,suball,literal,offset,result", test_pattern_substitute
+)
+def test_pattern_substitute(pattern, replacement, subject, suball, literal, offset, result):
+    """ substitute() returns the expected string for each option combination."""
+    p = pcre2.compile(pattern)
+    assert p.substitute(replacement, subject, suball=suball, literal=literal, offset=offset) == result
+
+def test_pattern_findall():
+    """ findall() on compiled patterns: groups, character-class edge cases and quantifiers."""
+    p = pcre2.compile(r'(\w+)=(\d+)')
+    assert p.findall('set width=20 and height=10') == [('width=20', 'width'), ('height=10', 'height')]
+    s = bytes(range(128)).decode()
+    p2 = pcre2.compile(r'[0-9--1]')
+    assert p2.findall(s) == list('-./0123456789')
+    p3 = pcre2.compile(r'[%--1]')
+    assert p3.findall(s) == list("%&'()*+,-1")
+    p4 = pcre2.compile(r'[%--]')
+    assert p4.findall(s) == list("%&'()*+,-")
+    p5 = pcre2.compile(r'[0-9&&1]')
+    assert p5.findall(s) == list('&0123456789')
+    p6 = pcre2.compile(r'[\d&&1]')
+    assert p6.findall(s) == list('&0123456789')
+    p7 = pcre2.compile(r'[0-9||a]')
+    assert p7.findall(s) == list('0123456789a|')
+    p8 = pcre2.compile(r'[\d||a]')
+    assert p8.findall(s) == list('0123456789a|')
+    p9 = pcre2.compile(r'[0-9~~1]')
+    assert p9.findall(s) == list('0123456789~')
+    p10 = pcre2.compile(r'[\d~~1]')
+    assert p10.findall(s) == list('0123456789~')
+    p11 = pcre2.compile(r'[[0-9]|]')
+    assert p11.findall(s) == list('0123456789[]')
+
+    for reps in '*', '+', '?', '{1}':
+        for mod in '', '?':
+            pattern = '.' + reps + mod + 'yz'
+            assert pcre2.compile(pattern, pcre2.S).findall('xyz') == ['xyz'], pattern
+            pattern = pattern.encode()
+            assert pcre2.compile(pattern, pcre2.S).findall(b'xyz') == [b'xyz'], pattern
+
+
+def test_pattern_jit_findall():
+    """ Module-level findall(): repeats, non-ASCII text, empty matches and atomic groups."""
+    assert pcre2.findall(r'(\w+)=(\d+)', 'set width=20 and height=10') == [('width=20', 'width'), ('height=10', 'height')]
+    assert pcre2.findall(":+", "abc") == []
+    assert pcre2.findall(":+", "a:b::c:::d") == [":", "::", ":::"]
+    assert pcre2.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"]
+
+    for x in ("\xe0", "\u0430", "\U0001d49c"):
+        xx = x * 2
+        xxx = x * 3
+        string = "a%sb%sc%sd" % (x, xx, xxx)
+        assert pcre2.findall("%s+" % x, string) == [x, xx, xxx]
+        assert pcre2.findall("(%s+)" % x, string) == [x, xx, xxx]
+
+    assert len(pcre2.findall(r"\b", "a")) == 2
+    assert len(pcre2.findall(r"\B", "a")) == 0
+    assert len(pcre2.findall(r"\b", " ")) == 0
+    assert len(pcre2.findall(r"\b", " ")) == 0
+    assert len(pcre2.findall(r"\B", " ")) == 2
+
+    s = bytes(range(128)).decode()
+    assert pcre2.findall(r'[--1]', s) == list('-./01')
+    assert pcre2.findall(r'[&&1]', s) == list('&1')
+    assert pcre2.findall(r'[||1]', s) == list('1|')
+    assert pcre2.findall(r'[~~1]', s) == list('1~')
+
+    assert pcre2.findall(r"(?i)(a)\1", "aa \u0100") == ['a']
+
+    assert pcre2.findall(r'a++', 'aab') == ['aa']
+    assert pcre2.findall(r'a*+', 'aab') == ['aa', '', '']
+    assert pcre2.findall(r'a?+', 'aab') == ['a', 'a', '', '']
+    assert pcre2.findall(r'a{1,3}+', 'aab') == ['aa']
+
+    assert pcre2.findall(r'(?:ab)++', 'ababc') == ['abab']
+    assert pcre2.findall(r'(?:ab)*+', 'ababc') == ['abab', '', '']
+    assert pcre2.findall(r'(?:ab)?+', 'ababc') == ['ab', 'ab', '', '']
+    assert pcre2.findall(r'(?:ab){1,3}+', 'ababc') == ['abab']
+
+    assert pcre2.findall(r'(?>a+)', 'aab') == ['aa']
+    assert pcre2.findall(r'(?>a*)', 'aab') == ['aa', '', '']
+    assert pcre2.findall(r'(?>a?)', 'aab') == ['a', 'a', '', '']
+    assert pcre2.findall(r'(?>a{1,3})', 'aab') == ['aa']
+
+    assert pcre2.findall(r'(?>(?:ab)+)', 'ababc') == ['abab']
+    assert pcre2.findall(r'(?>(?:ab)*)', 'ababc') == ['abab', '', '']
+    assert pcre2.findall(r'(?>(?:ab)?)', 'ababc') == ['ab', 'ab', '', '']
+    assert pcre2.findall(r'(?>(?:ab){1,3})', 'ababc') == ['abab']
+
+    import re
+    b = 'y\u2620y\u2620y'.encode('utf-8')
+    assert len(pcre2.findall(re.escape('\u2620'.encode('utf-8')), b)) == 2
+
+
+def test_pattern_split():
+    """ split() on a compiled pattern built from a class of full-stop characters."""
+    pattern = "[\u002E\u3002\uFF0E\uFF61]"
+    assert pcre2.compile(pattern).split("a.b.c") == ['a','b','c']
+
+
+def test_pattern_jit_split():
+    """ Module-level split(): separators, captured separators, bytes and maxsplit."""
+    assert pcre2.split(":", ":a:b::c") == ['', 'a', 'b', '', 'c']
+    assert pcre2.split(":+", ":a:b::c") == ['', 'a', 'b', 'c']
+    assert pcre2.split("(:+)", ":a:b::c") == ['', ':', 'a', ':', 'b', '::', 'c']
+
+    assert pcre2.split(b":", b":a:b::c") == [b'', b'a', b'b', b'', b'c']
+    assert pcre2.split(b":+", b":a:b::c") == [b'', b'a', b'b', b'c']
+    assert pcre2.split(b"(:+)", b":a:b::c") == [b'', b':', b'a', b':', b'b', b'::', b'c']
+
+    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
+                    "\U0001d49c\U0001d49e\U0001d4b5"):
+        string = ":%s:%s::%s" % (a, b, c)
+        assert pcre2.split(":", string) == ['', a, b, '', c]
+        assert pcre2.split(":+", string) == ['', a, b, c]
+        assert pcre2.split("(:+)", string) == ['', ':', a, ':', b, '::', c]
+
+    assert pcre2.split("(?::+)", ":a:b::c") == ['', 'a', 'b', 'c']
+    assert pcre2.split("([b:]+)", ":a:b::c") == ['', ':', 'a', ':b::', 'c']
+    assert pcre2.split("(?:b)|(?::+)", ":a:b::c") == ['', 'a', '', '', 'c']
+
+    assert pcre2.split(":", ":a:b::c", 2) == ['', 'a', 'b::c']
+    assert pcre2.split(":", ":a:b::c", maxsplit=2) == ['', 'a', 'b::c']
+    assert pcre2.split(':', 'a:b:c:d', maxsplit=2) == ['a', 'b', 'c:d']
+    assert pcre2.split("(:)", ":a:b::c", maxsplit=2) == ['', ':', 'a', ':', 'b::c']
+    assert pcre2.split("(:+)", ":a:b::c", maxsplit=2) == ['', ':', 'a', ':', 'b::c']